# Qdrant Vector Database Creation

In [1]:
import os
import requests

from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Qdrant
from llama_cpp import Llama
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient

## GitHub Documents

In [2]:
# Enter your GitHub Personal Access Token securely
ACCESS_TOKEN=getpass(prompt="GitHub Personal Access Token: ")

GitHub Personal Access Token:  ········


In [8]:
def fetch_and_process_rst_files(repo, branch, path):
    """
    Recursively fetch and process RST files from a GitHub repository.
    """
    base_url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}"
    headers = {'Accept': 'application/vnd.github.v3+json'}
    response = requests.get(base_url, headers=headers)
    response.raise_for_status()  # This will raise an error for failed requests
    files = response.json()

    documents = []
    for file in files:
        if file['type'] == 'dir':  # This is a directory; recurse into it
            documents.extend(fetch_and_process_rst_files(repo, branch, file['path']))
        elif file['name'].endswith('.rst'):
            file_url = file['download_url']
            response = requests.get(file_url, headers={'Accept': 'application/vnd.github.v3.raw'})
            response.raise_for_status()
            title = file['name'].replace('.rst', '').replace('_', ' ').title()
            documents.append(Document(page_content=response.text, metadata={"title": title, "url": file_url}))

    return documents

In [9]:
# Usage example
repository = 'astropy/astropy'
branch = 'main'
docs_path = 'docs'

In [10]:
github_documents = fetch_and_process_rst_files(
    repo=repository, 
    branch=branch, 
    path=docs_path,
)

HTTPError: 403 Client Error: rate limit exceeded for url: https://api.github.com/repos/astropy/astropy/contents/docs/io?ref=main

In [None]:
len(github_documents)

In [None]:
lengths = [len(doc.page_content) for doc in github_documents]

# Plot the distribution of document lengths, counted as the number of tokens
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of characters)")
plt.show()

> TODO: What happens if send a context to OLMo that's > 2048

## Arxiv Abstracts

In [None]:
# We will use the already pickled file but refer to the notebook in the Appendix if you are interested in understanding how we built it
astro_df = pd.read_pickle("../../resources/data/astro-ph-arXiv-abstracts.pkl")

In [None]:
print("Number of astrophysics papers: ", len(astro_df))

In [None]:
astro_df.head()

### Documents Loader

LangChain helps load different documents (.txt, .pdf, .docx, .csv, .xlsx, .json) to feed into the LLM. The Document Loader even allows YouTube audio parsing and loading as part of unstructured document loading.

Once loaded into the LangChain, the document can be pre-processed in different ways as required in the LLM application.  

In [None]:
from langchain_community.document_loaders import DataFrameLoader

In [None]:
# Load the dataframe full of abstracts
# to memory in the form of LangChain Document objects
loader = DataFrameLoader(astro_df, page_content_column="abstract") 
astrophysics_abstracts_documents = loader.load()

In [None]:
print("Number of astrophysics papers: ", len(astrophysics_abstracts_documents))

In [None]:
all_documents = astrophysics_abstracts_documents + github_documents
print("Total Number of Documents: ", len(all_documents))

## Qdrant Creation

In [None]:
# TODO: Fix the path
qdrant_path = "../../resources/data/qdrant/scipy_qdrant/"
qdrant_collection = "arxiv_astro-ph_abstracts_astropy_github_documentation"

In [None]:
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
print(f"Creating new Qdrant collection '{qdrant_collection}' from {len(all_documents)} documents")
    
# Load the documents into a Qdrant Vector Database Collection
# this will save locally in the current directory as sqlite
qdrant = Qdrant.from_documents(
    documents=all_documents,
    embedding=model,
    path=qdrant_path,
    collection_name=qdrant_collection,
)

In [None]:
# Setup the retriever for later step
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [None]:
retriever.invoke("What is dark matter?")

In [None]:
retriever.invoke("How can I perform celestial coordinate transformations?")

In [None]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
print(format_docs(retriever.invoke("How can I perform celestial coordinate transformations?")))