In [1]:
import json
import importlib
import Utils
importlib.reload(Utils)  # This will print "Utils.py is loaded" again

import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

Utils.py is loaded
Utils.py is loaded


In [2]:
import chromadb.utils.embedding_functions as embedding_functions

# Can use SentenceTransformer to get embeddings for sentences
# https://docs.trychroma.com/docs/embeddings/embedding-functions
# 3 as of 3/7/2025: "intfloat/multilingual-e5-large-instruct", but only 514 Max tokens!!!!
# https://huggingface.co/spaces/mteb/leaderboard
# https://huggingface.co/intfloat/multilingual-e5-large-instruct

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-large-instruct"
)

In [3]:
ret_list_dicts = []
# read real-links list from json
with open('data/2.pb_articles_dicts.json', 'r') as f:
    ret_list_dicts = json.load(f)


In [4]:
ret_list_dicts

[{'title': 'The Man Behind the Berms: Kyle Strait on Crafting Sea Otter’s Dual Slalom',
  'author': 'lifetimeseaotterclassic',
  'date': '03-07-2025',
  'text': 'Words: Dan Hughes I get bored easily…as a human and as a\xa0rider. — Kyle\xa0Strait This sentiment, expressed by the man behind the dual slalom courses at the Life Time Sea Otter Classic presented by Continental, Kyle Strait, goes a long way to explaining the continued excitement and staying power of the discipline at Laguna Seca. From young racer to master builder, Strait’s cycling career has taken him around the globe, but always with a soft spot for Sea Otter. Originally from Southern California, Strait first encountered Sea Otter through the pages of mountain bike magazines. Inspired by what he saw, he asked his father if they could attend and, in 2000, got his first taste of the scale of a Sea Otter. You come up the hill to the guard shack and can see the whole place, the XC folks, the pits, and the hill where the dual sl

In [5]:


client = chromadb.PersistentClient(
    path="db",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [6]:
collection_name = "Pinkbike_articles"
collection = client.get_or_create_collection(name=collection_name,embedding_function=sentence_transformer_ef)

In [7]:
# Extract texts and metadata from documents
texts = [doc["title"] + ': ' + doc["text"] for doc in ret_list_dicts]

# Convert Timestamp to simple date string (e.g., '2025-03-06')
metadatas = [{
    "title": doc["title"],
    "author": doc["author"],
    "url": doc["url"],
    "date": doc["date"]
} for doc in ret_list_dicts]

ids = [doc["sha256"] for doc in ret_list_dicts]

In [8]:
# Retrieve all documents to check for existing IDs (if any)
existing_docs = collection.get()  # Get all documents
print(existing_docs)
existing_ids = existing_docs['ids']  # Extract existing IDs

# Filter out documents that have already been added (duplicate IDs)
filtered_texts = []
filtered_metadatas = []
filtered_ids = []

for text, meta, doc_id in zip(texts, metadatas, ids):
    if doc_id not in existing_ids:  # If the ID is not in the existing set, add it
        filtered_texts.append(text)
        filtered_metadatas.append(meta)
        filtered_ids.append(doc_id)
    else:
        print(f"Document with ID '{doc_id}' already exists in the collection.")



{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'data': None, 'metadatas': [], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [9]:
# Add only non-duplicate documents to the collection
if filtered_texts:
    print(f"Adding {len(filtered_texts)} new documents to the collection.")
    collection.add(
        documents=filtered_texts,
        metadatas=filtered_metadatas,
        ids=filtered_ids  # Only add documents that are not duplicates
    )
else:
    print("No new documents to add (all duplicates).")

Adding 55 new documents to the collection.


In [14]:
# Verify by querying the collection
results = collection.query(query_texts=["2025 predictions for mountain biking"], n_results=4)
results

{'ids': [['e35667d997365bf628da540beffaf9fbeb6e720a0c22ea474c7e5f1e8a63857d',
   '1f2c008e061346eed4788d0c2072ec9e54d8b136fb13ab03b5b806db394059c0',
   'fe1dad48c8120bc64822deca1174b8e47794ae4ec1d404a92ea14b4cd841e202',
   '5e1ca4c8878b19f3a52e1628d74bbb27c46231b1a2cf08566ecd98b480bc6c41']],
 'embeddings': None,
 'documents': [["Dario DiGiulio's 2025 Predictions: Behold, the visions of the seer. More radial tires. Schwalbe beat the competition to the punch with their radial tire release, but I don't think things will settle with that first release. The concept has legs, and the ride quality the tires offer seems to impress everyone that has a chance to ride them. As far as I understand it, the concept isn’t under patent protection, and can be iterated with various thread counts and casing approaches beyond the basic orientation change. I for one am stoked to see what the rubber wizards do with the idea over the next year or two. More downhill bikes. I think we’re going to see a few bra

In [11]:
collection.get()

{'ids': ['12aa8e23c1717cbfc8367f03cb03b36c54e6a8cfc6b077888c581831e573ae75',
  '86f02bdd26b103ea960251517c06648284a5576118682103e5796bafc5e510d3',
  '43b4459c227df7cd07a22d9f15db871ddbf668ba38aad02194aee076e669faa6',
  '1534bbb221a6f51fb3339ca8656a774c090487817b210db2534d4a5956d8fca3',
  '4f9ad2417e6000120d4fbd8248dae7711823b41f6823b6c340715f76cfed571f',
  '162bd75dde542af2f67068b001c32acfb79b27dcbc647e9c4b3a0e3ff8f9508f',
  'b802da173246a2e0526d7810af25294389e40addb2e4b3566c4913296942535b',
  'a69a5859f31ce7ac149f65b65b7119767b511a4743dd9613c35bdcd0cf8154f5',
  '6ad0d6b960f1e9f46ca9517f6de705b065a681b8c598199fac4f2b6924e1005d',
  'e34b7e17cadf9f2297a33af3b41e6577731307b276d698a91f182bd32ce28a9e',
  '352ee3b23d9fdaeee3fe1e5458aa9c012707906c5b1ddd8a7032452826365114',
  'e20fd789bfe5b533c2e5c8d7978cd1ebe9bdcf735b71447651364d570845eb44',
  '9743ec2ffb19850d3420a1b84127d3dc845b9ffcca24022b6b0679e2449b9531',
  '1a789dabfe94e1ed46134cd5b0762677af86ec9656d61f2f9e77bd7c33d5eb64',
  'e272d3b6f3

In [12]:
len(collection.get()['ids'])

55

In [17]:
results['ids'][0]

['e35667d997365bf628da540beffaf9fbeb6e720a0c22ea474c7e5f1e8a63857d',
 '1f2c008e061346eed4788d0c2072ec9e54d8b136fb13ab03b5b806db394059c0',
 'fe1dad48c8120bc64822deca1174b8e47794ae4ec1d404a92ea14b4cd841e202',
 '5e1ca4c8878b19f3a52e1628d74bbb27c46231b1a2cf08566ecd98b480bc6c41']