# Qdrant Vector Database Creation

In [2]:
import os
import requests

from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Qdrant
from llama_cpp import Llama
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient

## GitHub Documents

In [17]:
# Prompt for the GitHub Personal Access Token; the typed value is masked
# rather than echoed, so it does not end up in the saved notebook output.
ACCESS_TOKEN = getpass("GitHub Personal Access Token: ")

GitHub Personal Access Token:  ········


In [18]:
def fetch_and_process_rst_files(repo, branch, path, token=None):
    """
    Recursively fetch and process RST files from a GitHub repository.

    Lists ``path`` through the GitHub "contents" REST API, descends into every
    sub-directory, and downloads each ``.rst`` file that is found.

    Parameters
    ----------
    repo : str
        Repository slug in ``"<owner>/<name>"`` form, e.g. ``"astropy/astropy"``.
    branch : str
        Branch, tag or commit, passed to the API as the ``ref`` query parameter.
    path : str
        Directory (or single file) inside the repository to start from.
    token : str, optional
        GitHub Personal Access Token used to authenticate the API calls. When
        omitted, the notebook-level ``ACCESS_TOKEN`` is used if it is defined
        and non-empty; otherwise the requests are sent unauthenticated.

    Returns
    -------
    list of Document
        One document per ``.rst`` file. ``page_content`` is the raw file text
        and ``metadata`` holds ``"title"`` (derived from the file name) and
        ``"url"`` (the file's download URL).

    Raises
    ------
    requests.HTTPError
        If a listing or download request returns an error status.
    requests.Timeout
        If a request does not complete within the timeout.
    """
    request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell
    rst_suffix = '.rst'

    # Fall back to the token collected earlier in the notebook, if there is one.
    if token is None:
        token = globals().get('ACCESS_TOKEN')

    api_headers = {'Accept': 'application/vnd.github.v3+json'}
    if token:
        # Authenticated calls get a far higher API rate limit than anonymous
        # ones, which matters because every sub-directory costs one API call.
        api_headers['Authorization'] = f'Bearer {token}'

    base_url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}"
    response = requests.get(base_url, headers=api_headers, timeout=request_timeout)
    response.raise_for_status()  # This will raise an error for failed requests
    files = response.json()
    if isinstance(files, dict):
        # The API returns a single object (not a list) when `path` is a file.
        files = [files]

    documents = []
    for file in files:
        if file['type'] == 'dir':  # This is a directory; recurse into it
            documents.extend(
                fetch_and_process_rst_files(repo, branch, file['path'], token=token)
            )
        elif file['name'].endswith(rst_suffix):
            file_url = file.get('download_url')
            if not file_url:
                # Nothing to download for this entry (no download URL provided).
                continue
            # The token is deliberately not sent here: the download URL is a
            # different host from the API and is fetched as-is.
            response = requests.get(
                file_url,
                headers={'Accept': 'application/vnd.github.v3.raw'},
                timeout=request_timeout,
            )
            response.raise_for_status()
            # Strip only the trailing extension; str.replace would also remove
            # any '.rst' occurring in the middle of the name.
            stem = file['name'][:-len(rst_suffix)]
            title = stem.replace('_', ' ').title()
            documents.append(
                Document(page_content=response.text, metadata={"title": title, "url": file_url})
            )

    return documents

In [19]:
# Usage example: index the `docs` directory on the `main` branch of the
# astropy/astropy GitHub repository.
repository = 'astropy/astropy'  # "<owner>/<name>" slug used in the API URL
branch = 'main'
docs_path = 'docs'

In [20]:
# Walk the docs directory recursively and collect one Document per .rst file
github_documents = fetch_and_process_rst_files(
    repo=repository, 
    branch=branch, 
    path=docs_path,
)

In [21]:
# Sanity check: how many .rst documents were fetched
len(github_documents)

309

## Arxiv Abstracts

In [22]:
# Load the pre-built DataFrame of arXiv astro-ph abstracts from a pickle file.
# Refer to the notebook in the Appendix if you are interested in how it was built.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
# TODO: Fix path
astro_df = pd.read_pickle("../../resources/data/astro-ph-arXiv-abstracts.pkl")

In [23]:
# Row count of the abstracts DataFrame
print("Number of astrophysics papers: ", len(astro_df))

Number of astrophysics papers:  331564


In [24]:
# Preview the first rows (columns: id, title, abstract)
astro_df.head()

Unnamed: 0,id,title,abstract
0,712.2086,On weak and strong magnetohydrodynamic turbulence,Recent numerical and observational studies c...
1,712.2103,Hilltop Curvatons,We study ``hilltop'' curvatons that evolve o...
2,712.211,Near-field cosmology with the VLT,With the arrival of wide-field imagers on me...
3,712.2111,The prototype colliding-wind pinwheel WR 104,Results from the most extensive study of the...
4,712.2116,X-ray spectral evolution of TeV BL Lac objects...,Many of the extragalactic sources detected i...


### Documents Loader

LangChain provides document loaders for many file formats (.txt, .pdf, .docx, .csv, .xlsx, .json) so that their contents can be fed into an LLM. Some loaders can even parse and load YouTube audio as unstructured documents.

Once loaded into LangChain, the documents can be pre-processed in whatever way the LLM application requires.

In [25]:
from langchain_community.document_loaders import DataFrameLoader

In [26]:
# Load the DataFrame of abstracts into memory as LangChain Document objects.
# The "abstract" column becomes each document's page_content; the other
# columns (id, title) are carried along as metadata.
loader = DataFrameLoader(astro_df, page_content_column="abstract") 
astrophysics_abstracts_documents = loader.load()

In [27]:
# Should match the DataFrame row count: one Document per abstract
print("Number of astrophysics papers: ", len(astrophysics_abstracts_documents))

Number of astrophysics papers:  331564


In [28]:
# Combine both sources into one corpus: arXiv abstracts first, then the
# GitHub documentation pages
all_documents = astrophysics_abstracts_documents + github_documents
print("Total Number of Documents: ", len(all_documents))

Total Number of Documents:  331873


## Qdrant Creation

In [29]:
# TODO: Fix the path
qdrant_path = "../../resources/data/qdrant/scipy_qdrant/"
qdrant_collection = "arxiv_astro-ph_abstracts_astropy_github_documentation"

In [30]:
# Embedding model used to turn each document (and later each query) into a vector
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [31]:
print(f"Creating new Qdrant collection '{qdrant_collection}' from {len(all_documents)} documents")

# Embed every document with `model` and load the vectors into a Qdrant
# Vector Database Collection. Because `path` is given, the collection is
# saved locally under qdrant_path (as sqlite).
# NOTE(review): every document is embedded in this call, so with the full
# corpus this cell is presumably long-running; consider guarding it on re-runs.
qdrant = Qdrant.from_documents(
    documents=all_documents,
    embedding=model,
    path=qdrant_path,
    collection_name=qdrant_collection,
)

Creating new Qdrant collection 'arxiv_astro-ph_abstracts_astropy_github_documentation' from 331873 documents


In [32]:
# Set up the retriever for the later steps: "mmr" (maximal marginal relevance)
# search, returning k=2 documents per query
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [33]:
# Smoke test with a science question
retriever.invoke("What is dark matter?")

[Document(page_content="  Dark matter is one of the greatest unsolved mysteries in cosmology at the\npresent time. About 80% of the universe's gravitating matter is non-luminous,\nand its nature and distribution are for the most part unknown. In this paper,\nwe will outline the history, astrophysical evidence, candidates, and detection\nmethods of dark matter, with the goal to give the reader an accessible but\nrigorous introduction to the puzzle of dark matter. This review targets\nadvanced students and researchers new to the field of dark matter, and includes\nan extensive list of references for further study.\n", metadata={'id': 1006.2483, 'title': 'Dark Matter: A Primer', '_id': 'e71cc3e253f9449e82ba6fd305323c68', '_collection_name': 'arxiv_astro-ph_abstracts_astropy_github_documentation'}),
 Document(page_content='  It is suggested that Dark Matter in the Universe is made of stars and black\nholes of WIMP matter.\n', metadata={'id': 'astro-ph/0204375', 'title': 'WIMP Stars as Dark

In [34]:
# Smoke test with a software how-to question
retriever.invoke("How can I perform celestial coordinate transformations?")

 Document(page_content="  AIMS: An alternative to the traditional method for modeling kinematics of the\nEarth's rotation is proposed. The purpose of developing the new approach is to\nprovide a self-consistent and simple description of the Earth's rotation in a\nway that can be estimated directly from observations without using intermediate\nquantities.\n  METHODS: Instead of estimating the time series of pole coordinates, the\nUT1--TAI angles, their rates, and the daily offsets of nutation, it is proposed\nto estimate coefficients of the expansion of a small perturbational rotation\nvector into basis functions. The resulting transformation from the terrestrial\ncoordinate system to the celestial coordinate system is formulated as a product\nof an a priori matrix of a finite rotation and an empirical vector of a\nresidual perturbational rotation. In the framework of this approach, the\nspecific choice of the a priori matrix is irrelevant, provided the angles of\nthe residual rotation 

In [35]:
# Post-processing
def format_docs(docs):
    """Return the text of all `docs` as one string, separated by blank lines."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

In [36]:
# Show the retrieved documents as plain text, joined by format_docs
print(format_docs(retriever.invoke("How can I perform celestial coordinate transformations?")))

.. _astropy-coordinates-transforming:

Transforming between Systems
****************************

`astropy.coordinates` supports a rich system for transforming
coordinates from one frame to another. While common astronomy frames
are built into Astropy, the transformation infrastructure is dynamic.
This means it allows users to define new coordinate frames and their
transformations. The topic of writing your own coordinate frame or
transforms is detailed in :ref:`astropy-coordinates-design`, and this
section is focused on how to *use* transformations.

The full list of built-in coordinate frames, the included transformations,
and the frame names are shown as a (clickable) graph in the
`~astropy.coordinates` API documentation.

Examples
--------

..
  EXAMPLE START
  Transforming Coordinates to Another Frame

The recommended method of transformation is shown below::

    >>> import astropy.units as u
    >>> from astropy.coordinates import SkyCoord
    >>> gc = SkyCoord(l=0*u.degree, b=4