1. pick some text documents
2. pick three embedding models (all three embedding models require less than 1GB of memory, and are small enough to run locally)
3. put text and embeddings generated by each model into pixeltable

--- query time ---

4. create embeddings of text
5. get top 3 results from each embedding model
6. rank results based on smallest sum of ranking

# Step 1: Import libraries

In [None]:
# ! pip install pixeltable tiktoken sentence-transformers

In [19]:
import pixeltable as pxt
import os
from pixeltable.functions.huggingface import sentence_transformer
import numpy as np

# Step 2: Define Embedding Functions

In [11]:
@pxt.expr_udf
def gist_embed(text: str) -> np.ndarray:
    return sentence_transformer(text, model_id='avsolatorio/GIST-Embedding-v0') #768 model from Aivin Solatorio

In [12]:
@pxt.expr_udf
def bge_base_embed(text: str) -> np.ndarray:
    return sentence_transformer(text, model_id='BAAI/bge-base-en-v1.5') #768 model from BAAI

In [90]:
@pxt.expr_udf
def e5_embed(text: str) -> np.ndarray:
    return sentence_transformer(text, model_id='intfloat/e5-large-v2')

# Step 3: Create Directory/Tables

In [23]:
pxt.drop_dir('reranker_demo', force=True)
pxt.create_dir('reranker_demo')

Created directory `reranker_demo`.


<pixeltable.catalog.dir.Dir at 0x32675c320>

In [71]:
pxt.drop_table('html_docs')

In [72]:
html_docs = pxt.create_table('html_docs', {'document': pxt.DocumentType()})

Created table `html_docs`.


# Step 4: Import Documents

In [68]:
base_url = "https://en.wikipedia.org/wiki/"

In [69]:
cities = [
    "Boston",
    "Seattle",
    "San_Francisco",
    "New_York_City"
]

In [73]:
city_urls = [base_url + cities[x] for x in range(len(cities))]

In [74]:
city_urls

['https://en.wikipedia.org/wiki/Boston',
 'https://en.wikipedia.org/wiki/Seattle',
 'https://en.wikipedia.org/wiki/San_Francisco',
 'https://en.wikipedia.org/wiki/New_York_City']

In [75]:
html_docs.insert(
    {'document': url} for url in city_urls
)

Inserting rows into `html_docs`: 4 rows [00:00, 1846.69 rows/s]
Inserted 4 rows with 0 errors.


UpdateStatus(num_rows=4, num_computed_values=0, num_excs=0, updated_cols=[], cols_with_excs=[])

# Step 5: Chunk Documents

In [77]:
from pixeltable.iterators import DocumentSplitter
chunked_docs = pxt.create_view(
    "chunked_docs",
    html_docs,
    iterator=DocumentSplitter.create(
        document=html_docs.document,
        separators='paragraph'
    )
)

Inserting rows into `chunked_docs`: 766 rows [00:00, 26178.30 rows/s]
Created view `chunked_docs` with 766 rows, 0 exceptions.


In [81]:
chunked_docs.describe()

Column Name,Type,Computed With
pos,Required[Int],
text,Required[String],
document,Required[Document],


In [104]:
chunked_docs["e5"] = e5_embed(chunked_docs.text)

Computing cells:   0%|                                                  | 0/766 [00:00<?, ? cells/s]

Computing cells: 100%|████████████████████████████████████████| 766/766 [02:04<00:00,  6.14 cells/s]
Added 766 column values with 0 errors.


In [92]:
chunked_docs["bge_base"] = bge_base_embed(chunked_docs.text)

Computing cells: 100%|████████████████████████████████████████| 766/766 [00:45<00:00, 17.00 cells/s]
Added 766 column values with 0 errors.


In [93]:
chunked_docs["gist"] = gist_embed(chunked_docs.text)

Computing cells: 100%|████████████████████████████████████████| 766/766 [00:44<00:00, 17.05 cells/s]
Added 766 column values with 0 errors.


In [98]:
chunked_docs.describe()

Column Name,Type,Computed With
pos,Required[Int],
text,Required[String],
e5,"Required[Array[(1024,), Float]]",e5_embed(text)
bge_base,"Required[Array[(768,), Float]]",bge_base_embed(text)
gist,"Required[Array[(768,), Float]]",gist_embed(text)
document,Required[Document],


# Step 6: Create Embedding Indexes

In [122]:
chunked_docs.add_embedding_index(col_name="text", idx_name="gist", string_embed=gist_embed)

Computing cells: 100%|████████████████████████████████████████| 766/766 [00:46<00:00, 16.36 cells/s]


In [110]:
chunked_docs.add_embedding_index(col_name="text", idx_name="bge", string_embed=bge_base_embed)

Computing cells: 100%|████████████████████████████████████████| 766/766 [00:47<00:00, 16.01 cells/s]


In [123]:
chunked_docs.add_embedding_index(col_name="text", idx_name="e5", string_embed=e5_embed)

Computing cells: 100%|████████████████████████████████████████| 766/766 [02:17<00:00,  5.57 cells/s]


# Step 7: Get Similarities

In [132]:
def top_k(query_text: str, idx_name: str):
    sim = chunked_docs.text.similarity(query_text, idx=idx_name)
    return chunked_docs.order_by(sim, asc=False).select(chunked_docs.text, sim=sim).limit(5).collect()

In [136]:
e5_sims = top_k("Population of Boston", idx_name="e5")
e5_sims

text,sim
Boston - Wikipedia Jump to content Search Search,0.853
"In 1822, [ 16 ] the citizens of Boston voted to change the official name from the ""Town of Boston"" to the ""City of Boston"", and on March 19, 1822, the people of Boston accepted the charter incorporating the city. [ 69 ] At the time Boston was chartered as a city, the population was about 46,226, while the area of the city was only 4.8 sq mi (12 km 2 ). [ 69 ]",0.848
"Boston ( US : / ˈ b ɔː s t ə n / ⓘ [ 9 ] ) is the capital and most populous city in the Commonwealth of Massachusetts in the United States . The city serves as the cultural and financial center of the New England region of the Northeastern United States . It has an area of 48.4 sq mi (125 km 2 ) [ 10 ] and a population of 675,647 as of the 2020 census , making it the third-largest city in the Northeast after New York City and Philadelphia . [ 4 ] The larger Greater Boston metropolitan statistical area , which includes and surrounds the city, has a population of 4,919,179 as of 2023, making it the largest in New England and eleventh-largest in the country. [ 11 ] [ 12 ] [ 13 ]",0.84
"State capital in New England, United States Boston State capital Downtown from Boston Harbor Acorn Street on Beacon Hill Old State House Massachusetts State House Fenway Park during a Boston Red Sox game Back Bay from the Charles River Flag Seal Coat of arms Wordmark Nickname(s): Bean Town, Title Town, others Motto(s): Sicut patribus sit Deus nobis ( Latin ) 'As God was with our fathers, so may He be with us' Show Boston Show Suffolk County Show Massachusetts Show the United States Boston Sh ...... mi (1,021.8/km 2 ) • Metro [ 6 ] 4,941,632 (US: 10th ) Demonym Bostonian GDP [ 7 ] • Boston (MSA) \$571.6 billion (2022) Time zone UTC−5 ( EST ) • Summer ( DST ) UTC−4 ( EDT ) ZIP Codes 53 ZIP Codes [ 8 ] 02108–02137, 02163, 02196, 02199, 02201, 02203–02206, 02210–02212, 02215, 02217, 02222, 02126, 02228, 02241, 02266, 02283–02284, 02293, 02295, 02297–02298, 02467 (also includes parts of Newton and Brookline) Area codes 617 and 857 FIPS code 25-07000 GNIS feature ID 617565 Website boston .gov",0.837
"In 2020, Boston was estimated to have 691,531 residents living in 266,724 households [ 4 ] —a 12% population increase over 2010. The city is the third-most densely populated large U.S. city of over half a million residents, and the most densely populated state capital. Some 1.2 million persons may be within Boston's boundaries during work hours, and as many as 2 million during special events. This fluctuation of people is caused by hundreds of thousands of suburban residents who travel to the city for work, education, health care, and special events. [ 162 ]",0.833


In [137]:
gist_sims = top_k("Population of Boston", idx_name="gist")
gist_sims

text,sim
"In 2020, Boston was estimated to have 691,531 residents living in 266,724 households [ 4 ] —a 12% population increase over 2010. The city is the third-most densely populated large U.S. city of over half a million residents, and the most densely populated state capital. Some 1.2 million persons may be within Boston's boundaries during work hours, and as many as 2 million during special events. This fluctuation of people is caused by hundreds of thousands of suburban residents who travel to the city for work, education, health care, and special events. [ 162 ]",0.879
"Boston ( US : / ˈ b ɔː s t ə n / ⓘ [ 9 ] ) is the capital and most populous city in the Commonwealth of Massachusetts in the United States . The city serves as the cultural and financial center of the New England region of the Northeastern United States . It has an area of 48.4 sq mi (125 km 2 ) [ 10 ] and a population of 675,647 as of the 2020 census , making it the third-largest city in the Northeast after New York City and Philadelphia . [ 4 ] The larger Greater Boston metropolitan statistical area , which includes and surrounds the city, has a population of 4,919,179 as of 2023, making it the largest in New England and eleventh-largest in the country. [ 11 ] [ 12 ] [ 13 ]",0.874
"Demographics [ edit ] See also: History of the Irish in Boston , History of Italian Americans in Boston , History of African Americans in Boston , Chinese Americans in Boston , Dominican-Americans in Boston , Vietnamese in Boston , and LGBT culture in Boston Historical population Year Pop. ±% 1680 4,500 — 1690 7,000 +55.6% 1700 6,700 −4.3% 1710 9,000 +34.3% 1722 10,567 +17.4% 1742 16,382 +55.0% 1765 15,520 −5.3% 1790 18,320 +18.0% 1800 24,937 +36.1% 1810 33,787 +35.5% 1820 43,298 +28.1% 1830 ...... 97,197 −13.0% 1970 641,071 −8.1% 1980 562,994 −12.2% 1990 574,283 +2.0% 2000 589,141 +2.6% 2010 617,594 +4.8% 2020 675,647 +9.4% 2022* 650,706 −3.7% *=population estimate. Source: United States census records and Population Estimates Program data. [ 148 ] [ 149 ] [ 150 ] [ 151 ] [ 152 ] [ 153 ] [ 154 ] [ 155 ] [ 156 ] [ 157 ] [ 158 ] [ 159 ] [ 160 ] 2010–2020 [ 4 ] Source: U.S. Decennial Census [ 161 ] Packed circles diagram showing estimates of the ethnic origins of people in Boston in 2021",0.849
"In 1822, [ 16 ] the citizens of Boston voted to change the official name from the ""Town of Boston"" to the ""City of Boston"", and on March 19, 1822, the people of Boston accepted the charter incorporating the city. [ 69 ] At the time Boston was chartered as a city, the population was about 46,226, while the area of the city was only 4.8 sq mi (12 km 2 ). [ 69 ]",0.816
Environment [ edit ] Population density and elevation above sea level in Greater Boston as of 2010,0.812


In [135]:
bge_sims = top_k("Population of Boston", idx_name="bge")
bge_sims

text,sim
"In 2020, Boston was estimated to have 691,531 residents living in 266,724 households [ 4 ] —a 12% population increase over 2010. The city is the third-most densely populated large U.S. city of over half a million residents, and the most densely populated state capital. Some 1.2 million persons may be within Boston's boundaries during work hours, and as many as 2 million during special events. This fluctuation of people is caused by hundreds of thousands of suburban residents who travel to the city for work, education, health care, and special events. [ 162 ]",0.79
"Boston ( US : / ˈ b ɔː s t ə n / ⓘ [ 9 ] ) is the capital and most populous city in the Commonwealth of Massachusetts in the United States . The city serves as the cultural and financial center of the New England region of the Northeastern United States . It has an area of 48.4 sq mi (125 km 2 ) [ 10 ] and a population of 675,647 as of the 2020 census , making it the third-largest city in the Northeast after New York City and Philadelphia . [ 4 ] The larger Greater Boston metropolitan statistical area , which includes and surrounds the city, has a population of 4,919,179 as of 2023, making it the largest in New England and eleventh-largest in the country. [ 11 ] [ 12 ] [ 13 ]",0.769
"Demographics [ edit ] See also: History of the Irish in Boston , History of Italian Americans in Boston , History of African Americans in Boston , Chinese Americans in Boston , Dominican-Americans in Boston , Vietnamese in Boston , and LGBT culture in Boston Historical population Year Pop. ±% 1680 4,500 — 1690 7,000 +55.6% 1700 6,700 −4.3% 1710 9,000 +34.3% 1722 10,567 +17.4% 1742 16,382 +55.0% 1765 15,520 −5.3% 1790 18,320 +18.0% 1800 24,937 +36.1% 1810 33,787 +35.5% 1820 43,298 +28.1% 1830 ...... 97,197 −13.0% 1970 641,071 −8.1% 1980 562,994 −12.2% 1990 574,283 +2.0% 2000 589,141 +2.6% 2010 617,594 +4.8% 2020 675,647 +9.4% 2022* 650,706 −3.7% *=population estimate. Source: United States census records and Population Estimates Program data. [ 148 ] [ 149 ] [ 150 ] [ 151 ] [ 152 ] [ 153 ] [ 154 ] [ 155 ] [ 156 ] [ 157 ] [ 158 ] [ 159 ] [ 160 ] 2010–2020 [ 4 ] Source: U.S. Decennial Census [ 161 ] Packed circles diagram showing estimates of the ethnic origins of people in Boston in 2021",0.745
"The Boston metro area contained a Jewish population of approximately 248,000 as of 2015. [ 186 ] More than half the Jewish households in the Greater Boston area reside in the city itself, Brookline , Newton , Cambridge , Somerville , or adjacent towns. [ 186 ] A small minority practices Confucianism , and some practice Boston Confucianism , an American evolution of Confucianism adapted for Boston intellectuals. [ 187 ]",0.732
"In 1822, [ 16 ] the citizens of Boston voted to change the official name from the ""Town of Boston"" to the ""City of Boston"", and on March 19, 1822, the people of Boston accepted the charter incorporating the city. [ 69 ] At the time Boston was chartered as a city, the population was about 46,226, while the area of the city was only 4.8 sq mi (12 km 2 ). [ 69 ]",0.728


# Step 8: Rerank

In [159]:
reranking_dict = {}

In [160]:
for index, text in enumerate(bge_sims["text"]):
    if text not in reranking_dict:
        reranking_dict[text] = bge_sims["sim"][index]
    elif text in reranking_dict:
        reranking_dict[text] += bge_sims["sim"][index]

In [162]:
for index, text in enumerate(e5_sims["text"]):
    if text not in reranking_dict:
        reranking_dict[text] = bge_sims["sim"][index]
    elif text in reranking_dict:
        reranking_dict[text] += bge_sims["sim"][index]

In [163]:
for index, text in enumerate(gist_sims["text"]):
    if text not in reranking_dict:
        reranking_dict[text] = bge_sims["sim"][index]
    elif text in reranking_dict:
        reranking_dict[text] += bge_sims["sim"][index]

In [164]:
reranking_dict

{"In 2020, Boston was estimated to have 691,531 residents living in 266,724 households [ 4 ] —a 12% population increase over 2010. The city is the third-most densely populated large U.S. city of over half a million residents, and the most densely populated state capital. Some 1.2\xa0million persons may be within Boston's boundaries during work hours, and as many as 2\xa0million during special events. This fluctuation of people is caused by hundreds of thousands of suburban residents who travel to the city for work, education, health care, and special events. [ 162 ]": 2.307025416367836,
 'Boston ( US : / ˈ b ɔː s t ə n / ⓘ [ 9 ] ) is the capital and most populous city in the Commonwealth of Massachusetts in the United States . The city serves as the cultural and financial center of the New England region of the Northeastern United States . It has an area of 48.4\xa0sq\xa0mi (125\xa0km 2 ) [ 10 ] and a population of 675,647 as of the 2020 census , making it the third-largest city in the

In [166]:
reranked = [(text, total_sim) for text, total_sim in reranking_dict.items()]
reranked.sort()
print(reranked[0])

('Boston ( US : / ˈ b ɔː s t ə n / ⓘ [ 9 ] ) is the capital and most populous city in the Commonwealth of Massachusetts in the United States . The city serves as the cultural and financial center of the New England region of the Northeastern United States . It has an area of 48.4\xa0sq\xa0mi (125\xa0km 2 ) [ 10 ] and a population of 675,647 as of the 2020 census , making it the third-largest city in the Northeast after New York City and Philadelphia . [ 4 ] The larger Greater Boston metropolitan statistical area , which includes and surrounds the city, has a population of 4,919,179 as of 2023, making it the largest in New England and eleventh-largest in the country. [ 11 ] [ 12 ] [ 13 ]', 2.2837965213607783)


In [59]:
pxt.drop_table('documents')

In [60]:
# docs = pxt.create_table('documents', {'document': pxt.DocumentType(nullable=True),
#                                       'document_name': pxt.StringType(nullable=True)})

docs = pxt.create_table('documents', {'document': pxt.DocumentType()})

Created table `documents`.


In [61]:
docs.describe()

Column Name,Type,Computed With
document,Required[Document],


In [62]:
ten_ks = os.listdir("./10k")
ten_ks

['uber_2021.pdf', 'lyft_2021.pdf']

In [63]:
for doc in ten_ks:
    docs.insert(
        document=f"./10k/{doc}"
    )

Inserting rows into `documents`: 1 rows [00:00, 706.59 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `documents`: 1 rows [00:00, 846.65 rows/s]
Inserted 1 row with 0 errors.


In [42]:
docs.describe()

Column Name,Type,Computed With
document,String,


In [64]:
from pixeltable.iterators import DocumentSplitter
chunked_docs = pxt.create_view(
    "chunked_docs",
    docs,
    iterator=DocumentSplitter.create(
        document=docs.document,
        separators='token_limit',
        limit=300
    )
)

FileNotFoundError: [Errno 2] No such file or directory: '/10k/uber_2021.pdf'

In [97]:
chunked_docs.drop_column("jxm_cde_small_v1")

In [96]:
chunked_docs["jxm_cde_small_v1"] = jxm_cde_small_v1_embed(chunked_docs.text)

Computing cells:   0%|                                                  | 0/766 [00:00<?, ? cells/s]


ModuleNotFoundError: No module named 'sentence_transformers_impl'

In [89]:
@pxt.expr_udf
def jxm_cde_small_v1_embed(text: str) -> np.ndarray:
    return sentence_transformer(text, model_id='jxm/cde-small-v1') #768 model from Jack Morris