### Embedding Model

In [4]:
# Setup
# ! pip3 install -U sentence-transformers
# ! pip3 install datasets
# ! pip3 install 'transformers[torch]' accelerate -U

In [14]:
from sentence_transformers import SentenceTransformer, models

sentences = ["This is an example sentence"]

# sentence-transformers model that maps sentences and paragraphs to a 384-dimensional dense vector space
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
embeddings = embedding_model.encode(sentences)
print(embeddings)

[[-2.02598123e-04  8.14801604e-02  3.13617624e-02  2.92063178e-03
   2.61564553e-02  2.90739760e-02  7.82618672e-02 -1.80423632e-03
   1.01344332e-01 -4.51711416e-02  5.84349632e-02 -1.53201362e-02
   5.49956262e-02 -9.86435190e-02 -3.50252688e-02  8.45673401e-03
   1.58608239e-02  1.05627300e-02 -3.42710093e-02 -4.75064246e-03
   9.99022350e-02 -2.06018798e-02 -4.47837934e-02  3.12135480e-02
  -1.19240722e-02 -5.15015535e-02 -1.33605665e-02  1.89621411e-02
   9.76810232e-02 -5.44111170e-02 -3.43313850e-02  8.12905282e-02
   4.88119759e-02 -1.10284155e-02  2.13518068e-02  1.27189662e-02
  -1.43967597e-02  3.62864062e-02 -7.61233494e-02  3.23294066e-02
   2.08102558e-02 -4.22015898e-02  9.12907273e-02  2.08530165e-02
  -3.08016930e-02 -8.38505253e-02  1.30891204e-02 -3.00631002e-02
   4.11229171e-02 -1.27495363e-01 -7.78027102e-02 -3.93412188e-02
   1.52594328e-03 -2.80108247e-02  3.41662578e-02  1.46712493e-02
  -7.71653429e-02  1.63619548e-01  4.11295332e-02 -5.24460115e-02
  -4.18772

### Embedding Fine-Tuning

In [60]:
# Setup
# ! pip3 install -U groq

# Create account
# 1. sign-up at https://console.groq.com/login
# 2. create API Key

from groq import Client

groq_api_key = "api_key"
client = Client(api_key=groq_api_key)

# example
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "Come up with a question from a given product. Output just the question."
        },
        {
            "role": "user",
            "content": "Our Densifique hair collection is formulated to increase hair density. This range is excellent for thinning hair, hair strengthening and as an effective hair texturizer.",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

What is the main benefit of the Densifique hair collection?


In [15]:
import os, gzip, json

def locale_filter(data, locale='en_US'):
    return [data['value'] for data in data if data.get('language_tag') == locale]

def get_product_descriptions(product, locale='en_US'):
    descriptions = locale_filter(product.get('product_description', []), locale)
    return descriptions

def get_product_bullet_points(product, locale='en_US'):
    bullet_points = locale_filter(product.get('bullet_point', []), locale)
    return bullet_points

def get_product_name(product, locale='en_US'):
    item_name = locale_filter(product.get('item_name', []), locale)
    return item_name[0] if item_name else ''

def file_iterator(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            yield json.loads(line)

# convert to structured product document
def get_product_doc(product):
    name = get_product_name(product)
    bullet_points = get_product_bullet_points(product)
    descriptions = get_product_descriptions(product)
    doc = {}
    if name:
        text = []
        text.append("Product Name: %s" % name)
        if bullet_points:
            text.append("- bullet points: %s" % ','.join(bullet_points))
        if descriptions:
            text.append("- description: %s" % ','.join(descriptions))
        doc[name] = '\n'.join(text)
    return doc

def process_json_gz(file_path):
    text = []
    products = file_iterator(file_path)
    for product in products:
        doc = get_product_doc(product)
        if doc: text.append(doc)
    return text


all_files = os.listdir("abo-listings/listings/metadata")

list_data = []
for file in all_files:
    file_path = 'abo-listings/listings/metadata/' + file
    data = process_json_gz(file_path)
    list_data.extend(data)

print(f"created {len(list_data)} product documents")

created 26424 product documents


In [99]:
# generate synthetic queries using LLM
# Groq limits 30 requests / minute
# Let's run it for 30 minutes: 900 total requests
# If we send it 25 descriptions per request
# total, 22500 training samples
import json

request_per_hour = 900
window_size = 25
total_samples = request_per_hour * window_size
samples = list_data[:total_samples]
train_data = []

def product_doc(window):
    doc = {}
    for product in window:
        doc.update(product)
    return doc

def format_training_data(llm_answer, products):
    questions = []
    try:
        json_str = llm_answer.split('```')[1]
        qa = json.loads(json_str)
        for pair in qa:
            name = pair['name']
            if name in products:
                question = {
                    'q': pair['question'],
                    'a': products[name],
                }
                questions.append(question)
    except Exception as e:
        print(e)
    return questions

for i in range(total_samples - window_size + 1):
    window = product_doc(samples[i: i + window_size])
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "Come up with one question for each of the product. Output as json format: ```{ name: [product name], question: [question] }```.",
            },
            {
                "role": "user",
                "content": "\n".join(window.values()),
            }
        ],
        model="llama3-8b-8192",
    )
    
    data = format_training_data(chat_completion.choices[0].message.content, window)
    train_data.extend(data)

print(f"created {len(train_data)} question & answer pairs")

Expecting ',' delimiter: line 40 column 97 (char 1832)
list index out of range
Expecting ',' delimiter: line 60 column 91 (char 2565)
Expecting value: line 68 column 14 (char 2577)
Expecting ',' delimiter: line 8 column 74 (char 235)
list index out of range
list index out of range
list index out of range
Expecting ',' delimiter: line 20 column 84 (char 689)
Expecting ',' delimiter: line 16 column 85 (char 710)
Expecting ',' delimiter: line 12 column 91 (char 435)
list index out of range
Expecting value: line 64 column 13 (char 2717)
Expecting ',' delimiter: line 4 column 76 (char 82)
Expecting value: line 24 column 14 (char 805)
list index out of range
Expecting ',' delimiter: line 80 column 45 (char 3246)
Expecting ',' delimiter: line 4 column 80 (char 86)
Expecting ',' delimiter: line 68 column 45 (char 2523)
list index out of range
Expecting property name enclosed in double quotes: line 32 column 76 (char 1298)
Expecting property name enclosed in double quotes: line 28 column 76 (ch

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01jagmgwcafe7t4whts5etten7` on : Limit 500000, Used 497987, Requested 2763. Please try again in 2m9.4316s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}

In [1]:
# example training data

# train_data = [{'q': 'What is the spice level of this mustard?',
#   'a': "Product Name: Wickedly Prime Mustard, White Wine Jalapeno, 11.75 Ounce\n- bullet points: One 11.75-ounce pastic squeeze bottle,Spice Level: Medium,Packed on shared equipment with egg, wheat, soy, milk, fish,Shown as serving suggestion,Satisfaction Guarantee: We're proud of our products. If you aren't satisfied, we'll refund you for any reason within a year of purchase. 1-877-485-0385,An Amazon brand"},
#  {'q': 'What are the unique characteristics of each gemstone in this jewelry piece?',
#   'a': 'Product Name: 10k White Gold Swiss-Blue-Topaz and Diamond Accent Flame Drop Earrings\n- bullet points: An elegant design highlights colorful gemstone and diamond accents,Lever-back closure,The natural properties and composition of mined gemstones define the unique beauty of each piece. The image may show slight differences to the actual stone in color and texture.,Gemstones may have been treated to improve their appearance or durability and may require special care.,All our diamond suppliers confirm that they comply with the Kimberley Process to ensure that their diamonds are conflict free.,Imported'},
#  {'q': 'What materials are used in the construction of this loveseat?',
#   'a': 'Product Name: Amazon Brand – Rivet Canton Deep Mid-Century Modern Loveseat Sofa Couch, 76.7"W, White\n- bullet points: Crafted with postmodern flair, this creamy white loveseat with tapered wooden legs will enliven any modern living space while also providing comfort and durability. Easy to clean and maintain, this is a loveseat for the modern home.,76.7"W x 40.2"D x 34.3"H; seat height: 18.5"; seat depth: 32.3"; seat back height: 20.9"; arm height: 26.37"; leg height: 6.3",Solid wood frame and legs with 100% polyester upholstery,The tight, fixed back and removable cushions ensure durability and comfort.,This stunning work of craftsmanship takes the very best of modern design and reinvents it for the now.,Assemble in 15 minutes or less,Avoid moisture and wipe clean with soft, dry cloth.,Free returns for 30 days. 1-year warranty.'},
#  {'q': 'What are the features of this modern sectional sofa?',
#   'a': 'Product Name: Amazon Brand – Rivet Edgewest Low Back Modern Left U-Sectional, 117"W, Rouge Red\n- bullet points: Features such as track arms, box-edge framing and metal legs make this collection look sharp in modern or industrial settings. Back and seat cushions are reversible, and the pieces can be configured in many ways to accommodate family and friends.,117"W x 79"D x 32"H; Seat Height: 19"H ; Seat Depth: 23"D; Seat Back Height: 25"H; Arm Height: 25"H; Leg Height: 5.25"H,Polyester fabric on hardwood pine frame; metal legs with bronze finish,Crisp, clean lines and metal legs are hallmarks of contemporary style,Carefully crafted with a solid wood frame to ensure durability,Easy assembly in less than 15 minutes; just screw in legs,Free returns for 30 days. 1-year warranty.'},
#  {'q': 'What are the features and benefits of this ballet flat?',
#   'a': "Product Name: Amazon Essentials Women's Ballet Flat, Leopard Knit, 6.5 B US\n- bullet points: Classic and versatile ballet flat designed for daily wear and superior fit; a staple in every woman’s wardrobe,Casual silhouette with round toe shape and flattering profile and soft, mesh upper with gently elasticized topline,Sweat absorbent faux leather sock with natural hand and feel and padded insole with 8mm high quality memory for supreme cushioning and comfort,Genuine leather counter lining with “Heel-Pillow” comfort technology,Ultra flexible, lightweight, and durable TR outsole with minimalist heel, and traction grip forepart"},
#  {'q': 'What are the nutritional benefits and certifications of these almonds?',
#   'a': 'Product Name: 365 Everyday Value, Almonds, Roasted & Salted, 8 oz\n- bullet points: Good source of fiber.,Excellent source of Vitamin E & Magnesium.,Made with sea salt.,Certified GMO-Free, Certified Kosher, Certified Vegan.,Brought to you by Whole Foods Market. Our standards are what set us apart, and our quality is what keeps us stocking pantries, fridges and freezers with the best natural and organic 365 Everyday Value products every day.'},
#  {'q': 'What are the coffee beans used in this medium roast coffee?',
#   'a': 'Product Name: AmazonFresh Hazelnut Flavored Coffee, Ground, Medium Roast, 1.75 Ounce\n- bullet points: Fragrant medium-light roast with toasted hazelnut flavor,One 1.75-ounce bag of ground coffee,100% Arabica coffee grown in Central and South America,Roasted and packed in the U.S.A.,Shown as a serving suggestion'},
#  {'q': 'What are the features and benefits of this shredder and lubricant set?',
#   'a': 'Product Name: AmazonBasics 6-Sheet Cross-Cut Paper and Credit Card Shredder and Aleratec Shredder Lubricant Sheets (Pack of 12) Set\n- bullet points: Includes an AmazonBasics 6-Sheet Cross-Cut Paper and Credit Card Shredder and Aleratec Shredder Lubricant Sheets (Pack of 12),Cross-cut paper shredder with 6-sheet capacity; destroys credit cards (one at a time),Shreds paper into strips measuring 7/32 by 1-27/32 inches; Auto start and manual reverse to clear paper jams,Shredder lubricant sheets reduce blade friction optimizing shredder performance; Sheet Size: 8 1/2" x 6",Lubricant sheets made with sugar cane pulp,vegetable oil (soybean oil) and recycled paper'},
#  {'q': 'What are the origins and certifications of these whole black peppercorns?',
#   'a': 'Product Name: 365 EVERYDAY VALUE Whole Black Peppercorns, 1.87 OZ\n- bullet points: Brought to you by Whole Foods Market.\xa0 The packaging for this product has a fresh new look. During this transition, you may get the original packaging or the new packaging in your order, but the product and quality is staying exactly the same. Enjoy!,Product of Vietnam.,Certified GMO-Free, Certified Kosher.'},
#  {'q': 'What are the characteristics and certifications of this olive oil?',
#   'a': 'Product Name: Whole Foods Market, Extra Virgin Olive Oil of Portugal, 33.8 fl oz\n- bullet points: Made exclusively with arbequina olives grown in Portugal, this oil owes its distinctive character to rich soil that has cultivated olives for thousands of years. Fruity yet smooth with notes of fresh almonds.,This oil is best used raw in order to preserve its delicate properties.,Certified GMO-Free, Certified Kosher, Certified Vegan.,Brought to you by Whole Foods Market. When it comes to innovative flavors and products sourced from artisans and producers around the world, the Whole Foods Market brand has you covered. Amazing products, exceptional ingredients, no compromises.,North American Olive Oil Association Certified Quality. Tested for quality to meet international olive oil standards. Product of Portugal.'},
#  {'q': 'What are the benefits and features of this omega-3 supplement?',
#   'a': 'Product Name: Whole Foods Market, Omega Joint Care, 90 ct\n- bullet points: Brought to you by Whole Foods Market,1065mg total omega-3s,Essential fatty acids'},
#  {'q': 'What are the features and benefits of these removable round labels?',
#   'a': 'Product Name: AmazonBasics Removable Round Labels, 1-Inch Diameter, White, Pack of 945\n- bullet points: Sticky-back, white, removable round labels formatted for laser and inkjet printers,Includes 15 sheets ; 63 labels per sheet; 945 labels per package,Ideal for sealing envelopes, making branded labels, personalized stickers, and more,Permanent adhesive bonds easily to paper products and other surfaces,Individual labels measure 1.5 inches in diameter each; printable sheet is 8.5 x 11 inches,Avery reference #: 6450'},
#  {'q': 'What are the characteristics and care instructions for this platinum-plated ring?',
#   'a': 'Product Name: Platinum Plated Sterling Silver Swarovski Zirconia Round Cut Halo Ring\n- bullet points: These silver pieces are built for longevity. This piece features a metal plating or flashing, or an electrocoating for a more lustrous appearance, but it can wear off with long-term or heavy use. To ensure the longevity of your plated items store your jewelry in a dark, cool, dry place such as a pouch or air tight box and avoid rubbing plated items together.\xa0Also try to avoid exposure to cleaning products and perfume which can both negatively affect your items. Your local jeweler can advise you where to send your jewelry if you would ever like them replated.'},
#  {'q': 'What are the origins and certifications of these poppy seeds?',
#   'a': 'Product Name: 365 Everyday Value, Poppy Seed, 2.4 oz\n- bullet points: Brought to you by Whole Foods Market.  Our standards are what set us apart, and our quality is what keeps us stocking pantries, fridges and freezers with the best natural and organic 365 Everyday Value products every day.,Product of Turkey.,Certified GMO-Free, Certified Kosher.'},
#  {'q': 'What are the features and benefits of this sectional sofa?',
#   'a': 'Product Name: Amazon Brand – Stone & Beam Andover Slipcover Right-Facing L-Shaped Sectional, 124"W, Stone Fabric\n- bullet points: Complete your living space with this stylish sectional. With its stone-colored upholstery, this sectional is an ideal accent for transitional-style home decor. The solid-wood frame is stable, and reversible cushions enhance durability and extend this sectional’s life.,Overall: 124"W x 96"D x 33"H; seat height: 21"H; seat depth: 25"D; seat back height: 28"H; arm height: 28"H; leg height: 4.25"H,Sturdy wood frame with stain-resistant fabric,Transitional style blends well with a variety of home-decor themes.,Back and seat cushions are reversible so you can choose the configuration that\'s right for you.,Assemble in 15 minutes or less,Free returns for 30 days. 3-year warranty.,This item is made to order just for you.'},
#  {'q': 'What are the features and benefits of this hemp-colored barstool?',
#   'a': 'Product Name: Amazon Brand – Stone & Beam Esme Memory-Swivel Barstool, 43.3"H, Hemp\n- bullet points: Add some stylish seating to your next get-together or party with this wingback barstool. This tall hemp-colored barstool features a memory swivel that returns the seat to its proper position every time. It will easily complement your contemporary kitchen or transitional bar.,20.1"W x 22.4"D x 43.3"H; seat height: 29.9"H; seat depth: 16.5"D; seat back height: 18.5"H; leg height: 24.8"H,Solid hardwood frame with moisture-repellent, stain-resistant fabric,A memory-swivel mechanism means this barstool always looks tidy.,Classic transitional look with memory swivel,No assembly required,Avoid moisture. Wipe clean with a soft, dry cloth.,Free returns for 30 days. 3-year warranty.'},
#  {'q': 'What are the ingredients and features of this Thai-style coconut chicken soup?',
#   'a': 'Product Name: Wickedly Prime Thai-Style Coconut Chicken Soup, 17 Ounce\n- bullet points: One 17-ounce carton,Contains coconut,Bean sprouts are serving suggestion only,An Amazon brand,One 17-ounce carton,Contains coconut,Bean sprouts are serving suggestion only,An Amazon brand'},
#  {'q': 'What are the features and benefits of this commercial-grade replacement cushion?',
#   'a': 'Product Name: Strathwood Chaise Lounge Replacement Cushion, Daiquri\n- bullet points: Commercial grade, built for superiority and strength'},
#  {'q': 'What are the features and certifications of this mild cheddar cheese bar?',
#   'a': 'Product Name: 365 by Whole Foods Market, Cheese Bar, Mild Cheddar, 8 Ounce (Packaging May Vary)\n- bullet points: Brought to you by Whole Foods Market. The packaging for this product has a fresh new look. During this transition, you may get the original packaging or the new packaging in your order, but the product and quality is staying exactly the same. Enjoy!,Produced Without Added Hormones (made with milk from cows not treated with rBST).,Keep Refrigerated.,Certified Vegetarian.'},
#  {'q': 'What are the features and benefits of this silver leaning garment rack?',
#   'a': 'Product Name: AmazonBasics Leaning Garment Rack, Silver\n- bullet points: Wall-leaning garment rack for clothing, linens, jackets, and more,Sturdy wooden bar for hangers or draping,Use in the bedroom, laundry room, foyer, or anywhere garment hanging is useful,Steel tube construction offers both durability and a lightweight design,Minimal, modern look that easily complements existing décor; silver color,Easy assembly'},
#  {'q': 'What are the features and benefits of this firm styling gel?',
#   'a': "Product Name: Amazon Brand - Solimo Men's Firm Styling Gel, 8.4oz (Pack of 3)\n- bullet points: Three 8.4-fluid ounce bottles of Solimo Men's Firm Styling Gel,Satisfaction Guarantee: We're proud of our products. If you aren't satisfied, we'll refund you for any reason within a year of purchase. 1-877-485-0385,An Amazon Brand"},
#  {'q': 'What are the features and benefits of this dark brown Adirondack chair?',
#   'a': 'Product Name: Strathwood Adirondack Chair with Cupholder, Dark Brown\n- bullet points: Classically designed Adirondack chair offers style and comfort,Made of FSC-certified eucalyptus; available in chocolate brown stain or white painted finish,Vertically slatted backrest, curved seat, and oversized armrests with cup holder,Minimal assembly required; coordinates with matching ottoman (sold separately),Measures 34 inches long by 27.5 inches wide by 37.4 inches high'},
#  {'q': 'What are the features and certifications of these USDA-organic scuba snacks?',
#   'a': 'Product Name: 365 Everyday Value Featuring Wild Kratts, Organic Scuba Snacks, White Cheddar, 4 oz\n- bullet points: Brought to you by Whole Foods Market.  Our standards are what set us apart, and our quality is what keeps us stocking pantries, fridges and freezers with the best natural and organic 365 Everyday Value products every day.,Made with real white cheddar cheese.,No sugar added.,Certified Organic, Certified Gluten Free, Certified Kosher, Certified Vegetarian.'}]


In [2]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

train_examples = []
for data in train_data:
  train_examples.append(InputExample(texts=[data['q'], data['a']]))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

  from tqdm.autonotebook import tqdm, trange


In [4]:
from sentence_transformers import losses

train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

In [5]:
# fine-tune
embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=100,
    show_progress_bar=True,
)

# save the fine-tuned model
embedding_model.save("all-MiniLM-L12-v2-abo", safe_serialization=False)

Step,Training Loss


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

### Vector Index

In [10]:
# Setup
# ! pip3 install -U pinecone

# Create Index
# 1. sign-up at https://www.pinecone.io/
# 2. create index named "product" with dimensions "384"
# 3. choose cloud provider (I chose AWS us-east-1 region)

In [16]:
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = 'my_key'
pc = Pinecone(api_key=pinecone_api_key)

# Create Index
index_name = "product"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=1
            region=2
        )
    )

pc_client = pc.Index(index_name)

In [22]:
must_have_keys = set(['item_id', 'brand', 'item_name', 'item_keywords', 'bullet_point'])
key_filter =  must_have_keys.union(set('product_description'))

def get_product_item(product, key_filter=key_filter, locale='en_US'):
    item = {}
    for key in key_filter:
        if key == 'item_id':
            item[key] = product[key]
        else:
            values = locale_filter(product.get(key, []), locale)
            if values:
                item[key] = ' '.join(values).strip()
    return item

def json_gz2product(file_path):
    data = []
    products = file_iterator(file_path)
    for product in products:
        # at least 'item_id', 'brand', 'item_name', 'item_keywords', 'bullet_point'
        item = get_product_item(product)
        if item.keys() == must_have_keys:
            data.append(item)
    return data

data = []
all_files = os.listdir("abo-listings/listings/metadata")
for file in all_files:
    file_path = 'abo-listings/listings/metadata/' + file
    products = json_gz2product(file_path)
    data.extend(products)

sentences = []
for item in data:
    sentences.append(' '.join([item.get('item_name', ''), item.get('bullet_point', ''), item.get('product_description', '')]).strip())
embeddings =  embedding_model.encode(sentences)  

vectors = []
for d, e in zip(data, embeddings):
    vectors.append({
        "id": d['item_id'],
        "values": e,
        "metadata": {'item_name': d['item_name']},
    })


window_size = 100
total_samples = 19900
vector_samples = vectors[:total_samples]

# index vectors to Pinecone
for i in range(total_samples - window_size + 1):
    window = vector_samples[i: i + window_size]
    pc_client.upsert(
        vectors=window,
        namespace="ns1"
    )

In [89]:
import sys

query = "Find me a kitchen table"
query_embedding = embedding_model.encode(query).tolist()
print(f'string query: {sys.getsizeof(query)} bytes, query embedding: {sys.getsizeof(query_embedding)} bytes\n')

results = pc_client.query(
    namespace="ns1",
    vector=query_embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

sorted_matches = sorted(results['matches'], key=lambda x: x['score'], reverse=True)
print(sorted_matches)

string query: 72 bytes, query embedding: 3128 bytes

[{'id': 'B07B78LZQC',
 'metadata': {'item_name': 'Amazon Brand – Rivet Ian Modern Wood Round Dining '
                           'Room Kitchen Table, 42"W, Brown'},
 'score': 0.575852633,
 'values': []}, {'id': 'B07B82PXD8',
 'metadata': {'item_name': 'Amazon Brand – Rivet Ian Modern Medium Dining '
                           'Kitchen Table, Expandable, 60-80"L, Brown'},
 'score': 0.556741416,
 'values': []}, {'id': 'B07B87J7L3',
 'metadata': {'item_name': 'Amazon Brand – Stone & Beam Hughes Round Wood '
                           'Dining Kitchen Table, Expandable, Brown'},
 'score': 0.550611913,
 'values': []}]


In [29]:
sorted_matches

[{'id': 'B07B78LZQC',
  'metadata': {'item_name': 'Amazon Brand – Rivet Ian Modern Wood Round Dining '
                            'Room Kitchen Table, 42"W, Brown'},
  'score': 0.575852633,
  'values': []},
 {'id': 'B07B82PXD8',
  'metadata': {'item_name': 'Amazon Brand – Rivet Ian Modern Medium Dining '
                            'Kitchen Table, Expandable, 60-80"L, Brown'},
  'score': 0.556741416,
  'values': []},
 {'id': 'B07B87J7L3',
  'metadata': {'item_name': 'Amazon Brand – Stone & Beam Hughes Round Wood '
                            'Dining Kitchen Table, Expandable, Brown'},
  'score': 0.550611913,
  'values': []}]

### Hybrid Search

In [31]:
# setup Elasticsearch for full-text search

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es_client = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", "password"),
    verify_certs=False,
    ssl_show_warn=False
)

index = 'product'

mapping = {
    "mappings": {
        "properties": {
            "item_id": {"type": "text"},
            "brand": {"type": "text"},
            "item_name": {"type": "text"},
            "item_keywords": {"type": "text"},
            "bullet_point": {"type": "text"},
            "product_description": {"type": "text"},
        }
    }
}

es_client.indices.create(index=index, body=mapping)

documents = []
data_sample = data[:total_samples]
for i in range(total_samples):
    data = data_sample[i]
    documents.append({
        '_index': index,
        '_id': i+1,
        '_source': {
            'item_id': data.get('item_id', ''),
            'brand': data.get('brand', ''),
            'item_name': data.get('item_name', ''),
            'item_keywords': data.get('item_keywords', ''),
            'bullet_point': data.get('bullet_point', ''),
            'product_description': data.get('product_description', ''),
        }
    })

success, _ = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")

Successfully indexed 19900 documents


In [77]:
# test text search

query = {
    "query": {
        "query_string": {
            "query": "Find me a kitchen table",
        }
    }
}

result = es_client.search(index=index, body=query)
hits = result['hits']
print([hit['_source'] for hit in hits['hits']])

[{'item_id': 'B0725Z3LDP', 'brand': 'find.', 'item_name': "find. Women's 123265 Boots, Red, 8 US", 'item_keywords': 'women woman Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women s

In [80]:
import time
from collections import defaultdict

def text_search(query):
    q = {
        "query": {
            "query_string": {
                "query": query,
            }
        }
    }

    result = es_client.search(index=index, body=q)
    hits = result['hits']
    return [hit['_source'] for hit in hits['hits']]

def vector_search(query, topk=3):
    query_embedding = embedding_model.encode(query).tolist()

    results = pc_client.query(
        namespace="ns1",
        vector=query_embedding,
        top_k=topk,
        include_values=False,
        include_metadata=True
    )
    
    sorted_matches = sorted(results['matches'], key=lambda x: x['score'], reverse=True)
    return sorted_matches

def reciprocal_rank_fusion(results, K=60):
    docs = []
    rrf_score = []
    
    for ranked_doc in results:
        for rank, doc in enumerate(ranked_doc, 1):
            docs.append(doc)
            rrf_score.append(1.0 / (rank + K))

    scored_docs = zip(docs, rrf_score)
    sorted_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return sorted_docs

def hybrid_search(query, topk=3):
    results = [vector_search(query, topk), text_search(query)]
    ranked_results = reciprocal_rank_fusion(results)
    return ranked_results

start_time = time.time()
results = hybrid_search("Find me a kitchen table")
print(f"Done in {time.time() - start_time} seconds")
print(results)

Done in 0.7587382793426514 seconds
[({'id': 'B07B78LZQC',
 'metadata': {'item_name': 'Amazon Brand – Rivet Ian Modern Wood Round Dining '
                           'Room Kitchen Table, 42"W, Brown'},
 'score': 0.575852633,
 'values': []}, 0.01639344262295082), ({'item_id': 'B0725Z3LDP', 'brand': 'find.', 'item_name': "find. Women's 123265 Boots, Red, 8 US", 'item_keywords': 'women woman Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashio

### Hybrid Search + LLM Reranker

In [84]:
# Reranker Model
# https://huggingface.co/BAAI/bge-reranker-v2-m3#model-list
# ! pip3 install -U FlagEmbedding peft

from FlagEmbedding import FlagLLMReranker
reranker = FlagLLMReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at BAAI/bge-reranker-v2-m3 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
def get_product_str(product):
    name = product['item_name']
    bullet_points = product['bullet_point']
    descriptions = product['product_description']
    text = []
    if name:
        text.append("Product Name: %s" % name)
        if bullet_points:
            text.append("- bullet points: %s" % ','.join(bullet_points))
        if descriptions:
            text.append("- description: %s" % ','.join(descriptions))
    return '\n'.join(text)

def llm_reranker(query, docs):
    pairs = [(query, get_product_str(doc)) for doc in docs]
    scores = reranker.compute_score(pairs)
    scored_docs = zip(docs, scores)
    sorted_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return sorted_docs

def fetch_doc(results):
    ids = []
    for result in results[0]:
        ids.append(result['id'])
    for result in results[1]:
        ids.append(result['item_id'])
    q = {
        "query": {
            "query_string": {
                "query": " OR ".join(ids),
                "default_field": "item_id",
            }
        }
    }
    result = es_client.search(index=index, body=q)
    hits = result['hits']
    return [hit['_source'] for hit in hits['hits']]

def hybrid_search2(query, topk=3):
    results = [vector_search(query, topk), text_search(query)]
    docs = fetch_doc(results)
    ranked_results = llm_reranker(query, docs)
    return ranked_results

start_time = time.time()
results = hybrid_search2("Find me a kitchen table")
print(f"Done in {time.time() - start_time} seconds")
print(results)

  0%|                                                                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.41s/it]

Done in 9.356230020523071 seconds
[({'item_id': 'B0725Z3LDP', 'brand': 'find.', 'item_name': "find. Women's 123265 Boots, Red, 8 US", 'item_keywords': 'women woman Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot boots boots for women boots women shoes shoes women stiletto boots womens ankle boots womens fashion womens shoes womens stilettos womenswear Find Thigh High Boot 


