# DEMO: semantic search by using vector embeddings

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [13]:
# Install dependencies.
!pip install asyncpg
!pip install numpy pandas
!pip install pgvector
!pip install langchain transformers sentence_transformers
!pip install Elasticsearch==7.17.9
!pip install markdownify
!pip install sklearn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [75]:
# Automatically restart kernel after installs so that your environment
# can access the new packages.
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [61]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan as escan
import pandas as pd
from collections import deque

# Elasticsearch configuration
host = os.environ.get("ELASTICSEARCH_HOST")
username = os.environ.get("ELASTICSEARCH_USERNAME")
password = os.environ.get("ELASTICSEARCH_PASSWORD")
es = Elasticsearch([host], http_auth=(username, password))
index = 'pages_alias'


query = {
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "savedAt": {
              "gte": "now-7d/d",
              "lte": "now/d",
              "format": "yyyy-MM-dd"
            }
          }
        },
        {
          "term": {
            "userId": {
              "value": os.environ.get("USER_ID")
            }
          }
        },
        {
          "exists": {
            "field": "rssFeedUrl"
          }
        }
      ]
    }
  },
  "_source": {
    "includes": [
      "id",
      "title",
      "author",
      "description",
      "content",
      "readingProgressPercent"
    ]
  },
  "sort": [
    {
      "savedAt": {
        "order": "desc"
      }
    }
  ]
}
# Scan API for larger library
response = escan(client=es, index=index, query=query, request_timeout=30, size=100)

# Initialize a double ended queue
output = deque()
# Extend deque with iterator
output.extend(response)
# Convert deque to DataFrame
# df = json_normalize(output)
# df = df[[x for x in df.columns if "_source." in x]]

# Search API
# search_result = es.search(index=index, body=query)
# hits = search_result["hits"]["hits"]
results = []
for hit in output:
    source = hit["_source"]
    source["id"] = hit["_id"]
    source["progress"] = source["readingProgressPercent"]
    # description could be null
    if 'description' not in source or source['description'] is None:
        source['description'] = ''
    # author could be null
    if 'author' not in source or source['author'] is None:
        source['author'] = ''
    results.append(source)
    
df = pd.DataFrame.from_records(results, exclude=['readingProgressPercent'])
df.head()

Unnamed: 0,author,description,id,title,content,progress
0,Deutsche Welle,The French embassy has said evacuation plans a...,lCdUsIkBwAZ9NKfgk1OK,Niger coup: France prepares evacuations – DW –...,"<div class=""page"" id=""readability-page-1""><div...",0.0
1,The Count,The new graphics card range will also come wit...,hflVsIkBfarSEDd1lMo-,Gigabyte debuts new Aorus Elite graphics cards...,"<div class=""page"" id=""readability-page-1""><div...",0.0
2,,"Unto all who come by these present letters, gr...",kidUsIkBwAZ9NKfge1PF,From their Highnesses,"<div class=""page"" id=""readability-page-1""><div...",0.0
3,Lee Moran,The former president has said he'll skip the F...,h_lVsIkBfarSEDd1ycoY,Trump Taunts GOP Rivals Over Debate With Vice ...,"<div class=""page"" id=""readability-page-1""><div...",0.0
4,Lee Moran,"The flashing sign may be gone, but won’t be fo...",nydVsIkBwAZ9NKfg0lN5,Elon Musk Ridiculed For Short Lived X Sign | H...,"<div class=""page"" id=""readability-page-1""><fig...",0.0


In [3]:
# Save the Pandas dataframe in a PostgreSQL table.
import asyncpg


async def main():
    # Create connection to SQL database
    conn = await asyncpg.connect(database='test')

    await conn.execute("DROP TABLE IF EXISTS test CASCADE")
    # Create the `test` table.
    await conn.execute(
        """CREATE TABLE test(
                    id VARCHAR(1024) PRIMARY KEY,
                    title TEXT,
                    author TEXT,
                    description TEXT,
                    progress NUMERIC,
                    content TEXT)"""
    )

    # Copy the dataframe to the `test` table.
    tuples = list(df.itertuples(index=False))
    await conn.copy_records_to_table(
        "test", records=tuples, columns=list(df), timeout=10
    )
    await conn.close()


# Run the SQL commands now.
await main()  # type: ignore

In [109]:
# convert html to markdown

from markdownify import markdownify as md

def html_to_markdown(html):
    return md(html)

Vector Embeddings

In [62]:
# Split long text descriptions into smaller chunks that can fit into
# the API request size limit, as expected by the LLM providers.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", "\n"],
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
chunked = []
for index, row in df.iterrows():
    id = row["id"]
    title = row["title"]
    description = row["description"]
    author = row["author"]
    # content = html_to_markdown(row["content"])
    splits = text_splitter.create_documents([title])
    for s in splits:
        r = {"id": id, "content": s.page_content}
        chunked.append(r)

Step 2: Generate vector embedding for each chunk by calling an Embedding Generation service

In [63]:
# Generate the vector embeddings for each chunk of text.
# This code snippet may run for a few minutes.

from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings_service = HuggingFaceEmbeddings( 
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

batch_size = 5
for i in range(0, len(chunked), batch_size):
    documents = [x["content"] for x in chunked[i : i + batch_size]]
    embedding = embeddings_service.embed_documents(documents)
    # Store the retrieved vector embeddings for each chunk back.
    for x, e in zip(chunked[i : i + batch_size], embedding):
        x["embedding"] = e

# Store the generated embeddings in a pandas dataframe.
embeddings = pd.DataFrame(chunked)
embeddings.head()

Unnamed: 0,id,content,embedding
0,lCdUsIkBwAZ9NKfgk1OK,Niger coup: France prepares evacuations – DW –...,"[0.02600826881825924, 0.015473218634724617, -0..."
1,hflVsIkBfarSEDd1lMo-,Gigabyte debuts new Aorus Elite graphics cards...,"[-0.003796227276325226, -0.028414282947778702,..."
2,kidUsIkBwAZ9NKfge1PF,From their Highnesses,"[0.041707273572683334, 0.03731207549571991, -0..."
3,h_lVsIkBfarSEDd1ycoY,Trump Taunts GOP Rivals Over Debate With Vice ...,"[0.011007378809154034, 0.03349965810775757, -0..."
4,nydVsIkBwAZ9NKfg0lN5,Elon Musk Ridiculed For Short Lived X Sign | H...,"[0.040546514093875885, -0.017225362360477448, ..."


In [None]:
# get token

In [71]:
from sklearn.cluster import KMeans
from sentence_transformers import util
import torch

# Perform kmean clustering
# num_clusters = 5
# clustering_model = KMeans(n_clusters=num_clusters)
# clustering_model.fit(embeddings['embedding'].to_list())
# cluster_assignment = clustering_model.labels_

# clustered_sentences = [[] for i in range(num_clusters)]
# for sentence_id, cluster_id in enumerate(cluster_assignment):
#     clustered_sentences[cluster_id].append(embeddings['content'][sentence_id])

# for i, cluster in enumerate(clustered_sentences):
#     print("Cluster ", i+1)
#     print(cluster)
#     print("")
    
#Two parameters to tune:
#min_cluster_size: Only consider cluster that have at least 25 elements
#threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
# creating tensor from embeddings 
torch_tensor = torch.tensor(embeddings['embedding'])

clusters = util.community_detection(torch_tensor, min_community_size=10, threshold=0.5)

#Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", embeddings['content'][sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", embeddings['content'][sentence_id])


Cluster 1, #82 Elements 
	 https://news.google.com/rss/articles/CBMif2h0dHBzOi8vd3d3LmRhaWx5bWFpbC5jby51ay9mZW1haWwvYXJ0aWNsZS0xMjM1OTA1OS9Db2NvLUV2ZS1iZXN0LWZha2UtdGFuLUF1c3RyYWxpYS1ib2R5LXNjcnViLW1vaXN0dXJpc2VyLXNtYWxsLWJ1c2luZXNzLmh0bWzSAYMBaHR0cHM6Ly93d3cuZGFpbHltYWlsLmNvLnVrL2ZlbWFpbC9hcnRpY2xlLTEyMzU5MDU5L2FtcC9Db2NvLUV2ZS1iZXN0LWZha2UtdGFuLUF1c3RyYWxpYS1ib2R5LXNjcnViLW1vaXN0dXJpc2VyLXNtYWxsLWJ1c2luZXNzLmh0bWw?oc=5
	 https://news.google.com/rss/articles/CBMingFodHRwczovL2dhbWVpc2hhcmQuZ2cvbmV3cy9vbmUtb2YtdGhlLWJlc3Qtc3RlYWx0aC1ob3Jyb3ItZ2FtZXMtYXJvdW5kLWhhcy1wYXNzZWQtdGhhdC1lYXJseS1hY2Nlc3MtdGhyZXNob2xkLWl0cy1kZWZpbml0ZWx5LXRpbWUtdG8tY2hlY2stb3V0LWdsb29td29vZC82OTg2L9IBAA?oc=5
	 https://www.politico.com/news/2023/08/01/maternal-health-care-crisis-00109106
	 ...
	 Here's What Happened Today: Monday · TheJournal.ie
	 The 8 at 8: Tuesday · TheJournal.ie
	 https://news.google.com/rss/articles/CBMiUmh0dHBzOi8vd3d3LmdhbWVzaHViLmNvbS9uZXdzL25ld3MvYWN0aXZpc2lvbi1ibGl6emFyZC1tZXJnZXItbmV

### Use pgvector to store the generated embeddings within PostgreSQL

- The `pgvector` extension introduces a new `vector` data type.
- **The new `vector` data type allows you to directly save a vector embedding (represented as a NumPy array) through a simple INSERT statement in PostgreSQL!**

>⚠️ The following code snippet may run for a few minutes.

In [112]:
# Store the generated vector embeddings in a PostgreSQL table.
# This code may run for a few minutes.

import numpy as np
from pgvector.asyncpg import register_vector


async def main():
    # Create connection to SQL database.
    conn = await asyncpg.connect(database='test')

    await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
    await register_vector(conn)

    await conn.execute("DROP TABLE IF EXISTS test_embeddings")
    # Create the `test_embeddings` table to store vector embeddings.
    await conn.execute(
        """CREATE TABLE test_embeddings(
                            test_id VARCHAR(1024) NOT NULL REFERENCES test(id),
                            content TEXT,
                            embedding vector(768))"""
    )

    # Store all the generated embeddings back into the database.
    for index, row in embeddings.iterrows():
        await conn.execute(
            "INSERT INTO test_embeddings (test_id, content, embedding) VALUES ($1, $2, $3)",
            row["id"],
            row["content"],
            np.array(row["embedding"]),
        )

    await conn.close()


# Run the SQL commands now.
await main()  # type: ignore

Step 1: Generate the vector embedding for the user query

In [154]:
user_query = 'sports'
min_progress = 0  # @param {type:"integer"}
max_progress= 100  # @param {type:"integer"}

qe = embeddings_service.embed_query(user_query)
qe

[-0.07225033640861511,
 -0.0016877002781257033,
 -0.00423727510496974,
 0.07335066795349121,
 0.0025142382364720106,
 -0.04136274382472038,
 -0.044574253261089325,
 0.03261534497141838,
 0.0007996244239620864,
 -0.024010641500353813,
 0.07381800562143326,
 -0.003088634228333831,
 -0.0035691699013113976,
 0.04320652782917023,
 0.04156501591205597,
 -0.013221917673945427,
 -0.053041424602270126,
 -0.021313371136784554,
 -0.017885055392980576,
 0.00309254159219563,
 -0.015640603378415108,
 0.0007444623042829335,
 -0.001701891073025763,
 -0.01781659573316574,
 -0.040776144713163376,
 -0.021051568910479546,
 0.05472518503665924,
 -0.005207449663430452,
 0.033254701644182205,
 0.04520756006240845,
 0.02684164047241211,
 -0.026571132242679596,
 0.012118871323764324,
 -0.06146808713674545,
 0.018111862242221832,
 -0.006144785322248936,
 0.011538649909198284,
 -0.023424915969371796,
 -0.02854336053133011,
 0.04319131746888161,
 0.05143482983112335,
 0.034985948354005814,
 0.02941146306693554,
 

Step 2: Use `pgvector` to find similar items


- The new `pgvector` similarity search operators provide powerful semantics
to combine the vector search operation with regular query filters in a single SQL query.
- **Using pgvector, you can now seamlessly integrate the power of relational databases with your vector search operations!**

In [153]:
from pgvector.asyncpg import register_vector
import asyncpg
from IPython.display import display, HTML

matches = []


async def main():
    # Create connection to SQL database.
    conn = await asyncpg.connect(database='test')

    await register_vector(conn)
    similarity_threshold = 0.3
    num_matches = 5

    # Find similar items to the query using cosine similarity search
    # over all vector embeddings. This new feature is provided by `pgvector`.
    results = await conn.fetch(
        """
                        WITH vector_matches AS (
                          SELECT test_id, 1 - (embedding <=> $1) AS similarity
                          FROM test_embeddings
                          WHERE 1 - (embedding <=> $1) > $2
                          ORDER BY similarity DESC
                          LIMIT $3
                        )
                        SELECT title, author, description, progress, content FROM test
                        WHERE id IN (SELECT test_id FROM vector_matches)
                        AND progress >= $4 AND progress <= $5
                        """,
        qe,
        similarity_threshold,
        num_matches,
        min_progress,
        max_progress
    )

    if len(results) == 0:
        raise Exception("Did not find any results. Adjust the query parameters.")

    for r in results:
        # Collect the description for all the matched similar items
        matches.append(
            f"""Title: {r["title"]}.
                Author: {r["author"]}.
                Description: {r["description"]}.
                Reading progress: {round(r['progress'])}.
                Content: {display(HTML(r["content"]))}."""
        )
        
    await conn.close()


# Run the SQL commands now.
await main()  # type: ignore

# Show the results for similar products that matched the user query.
matches

['Title: 👑 Axios Sports: Still king.\n                Author: Kendall Baker.\n                Description: Binge on the stats & stories that drive the sports world, by sports editor Kendall Baker.\nDaily - Weekdays.\n                Reading progress: 0.\n                Content: None.',
 'Title: 🔥 Axios Sports: Pickleball scoop.\n                Author: Kendall Baker.\n                Description: Binge on the stats & stories that drive the sports world, by sports editor Kendall Baker.\nDaily - Weekdays.\n                Reading progress: 0.\n                Content: None.',
 'Title: Axios Sports.\n                Author: Kendall Baker.\n                Description: Binge on the stats & stories that drive the sports world, by sports editor Kendall Baker.\nDaily - Weekdays.\n                Reading progress: 0.\n                Content: None.',
 "Title: ⚽️ Axios Sports: Streaming soccer.\n                Author: Jeff Tracy.\n                Description: 👋 Happy Friday! Pacers center Myl