In [11]:
import requests


In [28]:
response = requests.get("https://docs.langchain.com/llms.txt")
text = response.text
text_lines = text.splitlines()
text_lines

['# Docs by LangChain',
 '',
 '## Docs',
 '',
 '- [null](https://docs.langchain.com/index.md)',
 '- [Connect an authentication provider](https://docs.langchain.com/langsmith/add-auth-server.md)',
 '- [Human-in-the-loop using server API](https://docs.langchain.com/langsmith/add-human-in-the-loop.md)',
 '- [Add metadata and tags to traces](https://docs.langchain.com/langsmith/add-metadata-tags.md)',
 '- [Overview](https://docs.langchain.com/langsmith/administration-overview.md)',
 '- [Set up Agent Auth (Beta)](https://docs.langchain.com/langsmith/agent-auth.md): Enable secure access from agents to any system using OAuth 2.0 credentials with Agent Auth.',
 '- [Agent Builder](https://docs.langchain.com/langsmith/agent-builder.md)',
 '- [Agent Builder setup](https://docs.langchain.com/langsmith/agent-builder-setup.md): Add required workspace secrets for models and tools used by Agent Builder.',
 '- [Supported tools](https://docs.langchain.com/langsmith/agent-builder-tools.md)',
 '- [Alerts 

#### Extracting topics and urls

In [46]:
import re
topics = re.findall(r"\[([^\[\]]+)\]", text)
topics

['null',
 'Connect an authentication provider',
 'Human-in-the-loop using server API',
 'Add metadata and tags to traces',
 'Overview',
 'Set up Agent Auth (Beta)',
 'Agent Builder',
 'Agent Builder setup',
 'Supported tools',
 'Alerts in LangSmith',
 'Configure webhook notifications for LangSmith alerts',
 'Analyze an experiment',
 'Custom instrumentation',
 'Annotate traces and runs inline',
 'Use annotation queues',
 'Control plane API reference for LangSmith Deployment',
 'App development in LangSmith Deployment',
 'Application structure',
 'Assistants',
 'Log user feedback using the SDK',
 'How to audit evaluator scores',
 'Authentication & access control',
 'Authentication methods',
 'How to integrate LangGraph with AutoGen, CrewAI, and other frameworks',
 'How to kick off background runs',
 'Manage billing in your account',
 'Automatically run evaluators on experiments',
 'Implement a CI/CD pipeline using LangSmith Deployments and Evaluation',
 'LangGraph CLI',
 'Cloud',
 'How t

In [49]:
urls = re.findall(r'https?://[^\s\]]+',text)
urls

['https://docs.langchain.com/index.md)',
 'https://docs.langchain.com/langsmith/add-auth-server.md)',
 'https://docs.langchain.com/langsmith/add-human-in-the-loop.md)',
 'https://docs.langchain.com/langsmith/add-metadata-tags.md)',
 'https://docs.langchain.com/langsmith/administration-overview.md)',
 'https://docs.langchain.com/langsmith/agent-auth.md):',
 'https://docs.langchain.com/langsmith/agent-builder.md)',
 'https://docs.langchain.com/langsmith/agent-builder-setup.md):',
 'https://docs.langchain.com/langsmith/agent-builder-tools.md)',
 'https://docs.langchain.com/langsmith/alerts.md)',
 'https://docs.langchain.com/langsmith/alerts-webhook.md)',
 'https://docs.langchain.com/langsmith/analyze-an-experiment.md)',
 'https://docs.langchain.com/langsmith/annotate-code.md)',
 'https://docs.langchain.com/langsmith/annotate-traces-inline.md)',
 'https://docs.langchain.com/langsmith/annotation-queues.md)',
 'https://docs.langchain.com/langsmith/api-ref-control-plane.md)',
 'https://docs.l

In [66]:
from langchain_core.documents import Document
import uuid

document_structures=[]

for i,(topic,url) in enumerate(zip(topics,urls)):
    doc = Document(
        id=f"doc_{uuid.uuid4().hex[:8]}_{i}",
        page_content=topic,
        metadata={
            "url":url
        }
    )
    document_structures.append(doc)

In [69]:
document_structures[1]

Document(id='doc_44f8cf79_1', metadata={'url': 'https://docs.langchain.com/langsmith/add-auth-server.md)'}, page_content='Connect an authentication provider')

In [71]:
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("sample")

for doc in document_structures:
    collection.add(
        ids=doc.id,
        documents=doc.page_content,
        metadatas=doc.metadata
    )

In [73]:
collection.count()

497

In [74]:
collection.peek()

{'ids': ['doc_46e6be2c_0',
  'doc_44f8cf79_1',
  'doc_3bbb09bf_2',
  'doc_41fbd9cc_3',
  'doc_985eea15_4',
  'doc_edf70025_5',
  'doc_70941cbb_6',
  'doc_9d71a9e4_7',
  'doc_243f6eff_8',
  'doc_0b911065_9'],
 'embeddings': array([[ 0.03034231,  0.09264877, -0.0748743 , ..., -0.04384101,
         -0.03221784,  0.00014146],
        [-0.08304433,  0.01736081, -0.00843294, ..., -0.03126846,
          0.10851631, -0.03250459],
        [-0.07311932,  0.03917663, -0.10074075, ...,  0.03521687,
         -0.00917505, -0.04572854],
        ...,
        [-0.01138952, -0.02653145, -0.08392476, ...,  0.03865344,
          0.02316016, -0.01675651],
        [-0.08117197, -0.04342891, -0.04244822, ..., -0.050323  ,
          0.1169309 ,  0.05921618],
        [-0.06853037,  0.01133358,  0.05638402, ...,  0.02734672,
         -0.00561612, -0.03183711]], shape=(10, 384)),
 'documents': ['null',
  'Connect an authentication provider',
  'Human-in-the-loop using server API',
  'Add metadata and tags to tra

In [79]:
collection.query(query_texts="Langsmith self hosting",n_results=3)

{'ids': [['doc_3e10feec_96', 'doc_0f7abd1b_194', 'doc_811fa6d9_191']],
 'embeddings': None,
 'documents': [['Hosting LangSmith',
   'Self-hosted LangSmith',
   'Interact with your self-hosted instance of LangSmith']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'https://docs.langchain.com/langsmith/hosting.md)'},
   {'url': 'https://docs.langchain.com/langsmith/self-hosted.md)'},
   {'url': 'https://docs.langchain.com/langsmith/self-host-usage.md)'}]],
 'distances': [[0.15007463097572327,
   0.16184505820274353,
   0.32330355048179626]]}