# Qdrant Vector Database Creation

In [2]:
import jsonlines
import os
import requests
import typing as t

from getpass import getpass
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_qdrant import Qdrant
from llama_cpp import Llama
import pandas as pd
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient

## Download Data 

In [3]:
from ssec_tutorials import ASTROPH_ARXIV_ABSTRACTS, ASTROPY_GITHUB, download_astroph_arxiv_abstracts, download_astropy_github_documents, fetch_and_process_github_rst_files, download_qdrant_data, QDRANT_COLLECTION_NAME, QDRANT_PATH
from ssec_tutorials.scipy_conf import load_docs_from_jsonl

In [4]:
download_astropy_github_documents()

PosixPath('/Users/a42/.cache/ssec_tutorials/astropy-github.jsonl')

In [5]:
assert os.path.exists(ASTROPY_GITHUB)

In [6]:
download_astroph_arxiv_abstracts()

astro-ph arXiv abstracts already exist at /Users/a42/.cache/ssec_tutorials/astro-ph-arXiv-abstracts.pkl


PosixPath('/Users/a42/.cache/ssec_tutorials/astro-ph-arXiv-abstracts.pkl')

In [7]:
assert os.path.exists(ASTROPH_ARXIV_ABSTRACTS)

In [25]:
# Download qdrant vector database
download_qdrant_data()

PosixPath('/Users/a42/.cache/ssec_tutorials/scipy_qdrant_100k')

## GitHub Documents

In [6]:
# Enter your GitHub Personal Access Token securely
ACCESS_TOKEN = getpass(prompt="GitHub Personal Access Token: ")

GitHub Personal Access Token:  ········


In [9]:
# Usage example
repository = 'astropy/astropy'
branch = 'main'
docs_path = 'docs'

In [10]:
# Load Astropy docs from ASTROPY_GITHUB if already downloaded
if not os.path.exists(ASTROPY_GITHUB):
    github_documents = fetch_and_process_github_rst_files(
        github_repo=repository, 
        github_branch=branch, 
        github_documents=docs_path,
        github_personal_access_token=ACCESS_TOKEN
    )
else:
    github_documents = load_docs_from_jsonl(ASTROPY_GITHUB)

In [11]:
len(github_documents)

311

## Arxiv Abstracts

In [12]:
# We will use the already pickled file but refer to the notebook in the Appendix if you are interested in understanding how we built it
astro_df = pd.read_pickle(ASTROPH_ARXIV_ABSTRACTS)

In [13]:
print("Number of astrophysics papers: ", len(astro_df))

Number of astrophysics papers:  338658


In [14]:
astro_df.head()

Unnamed: 0,id,title,abstract
0,704.0009,"The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
1,704.0017,Spectroscopic Observations of the Intermediate...,Results from spectroscopic observations of t...
2,704.0023,ALMA as the ideal probe of the solar chromosphere,"The very nature of the solar chromosphere, i..."
3,704.0044,Astrophysical gyrokinetics: kinetic and fluid ...,We present a theoretical framework for plasm...
4,704.0048,Inference on white dwarf binary systems using ...,We report on the analysis of selected single...


In [15]:
# Randomly sample n rows to reduce the eventual vector database size.
astro_df = astro_df.sample(n=100000)

### Documents Loader

LangChain helps load different documents (.txt, .pdf, .docx, .csv, .xlsx, .json) to feed into the LLM. The Document Loader even allows YouTube audio parsing and loading as part of unstructured document loading.

Once loaded into the LangChain, the document can be pre-processed in different ways as required in the LLM application.  

In [16]:
from langchain_community.document_loaders import DataFrameLoader

In [17]:
# Load the dataframe full of abstracts
# to memory in the form of LangChain Document objects
loader = DataFrameLoader(astro_df, page_content_column="abstract") 
astrophysics_abstracts_documents = loader.load()

In [18]:
print("Number of astrophysics papers: ", len(astrophysics_abstracts_documents))

Number of astrophysics papers:  100000


In [19]:
all_documents = astrophysics_abstracts_documents + github_documents
print("Total Number of Documents: ", len(all_documents))

Total Number of Documents:  100311


## Qdrant Creation

In [20]:
QDRANT_PATH

PosixPath('/Users/a42/.cache/ssec_tutorials/scipy_qdrant')

In [21]:
QDRANT_COLLECTION_NAME

'arxiv_astro-ph_abstracts_astropy_github_documentation'

In [22]:
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

  from tqdm.autonotebook import tqdm, trange


In [26]:
if QDRANT_PATH.exists():
    print(f"Qdrant Vector Database Collection already exists in {QDRANT_PATH}, load it")
    client = QdrantClient(path=str(QDRANT_PATH))
    qdrant = Qdrant(
        client=client,
        collection_name=QDRANT_COLLECTION_NAME,
        embeddings=model
    )
else:
    print(f"Creating new Qdrant collection '{QDRANT_COLLECTION_NAME}' from {len(all_documents)} documents")
    
    # Load the documents into a Qdrant Vector Database Collection
    # this will save locally in the qdrant_path as sqlite
    qdrant = Qdrant.from_documents(
        documents=all_documents,
        embedding=model,
        path=str(QDRANT_PATH),
        collection_name=QDRANT_COLLECTION_NAME,
    )

Qdrant Vector Database Collection already exists in /Users/a42/.cache/ssec_tutorials/scipy_qdrant, load it


In [27]:
# Number of documents in qdrant
qdrant.client.count(collection_name=QDRANT_COLLECTION_NAME)

CountResult(count=100311)

In [28]:
# Setup the retriever for later step
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [29]:
retriever.invoke("What is dark matter?")

[Document(page_content='  One of the great scientific enigmas still unsolved, the existence of dark\nmatter, is reviewed. Simple gravitational arguments imply that most of the mass\nin the Universe, at least 90%, is some (unknown) non-luminous matter. Some\nparticle candidates for dark matter are discussed with particular emphasis on\nthe neutralino, a particle predicted by the supersymmetric extension of the\nStandard Model of particle physics. Experiments searching for these relic\nparticles, carried out by many groups around the world, are also discussed.\nThese experiments are becoming more sensitive every year and in fact one of the\ncollaborations claims that the first direct evidence for dark matter has\nalready been observed.\n', metadata={'id': 'hep-ph/0110122', 'title': 'The Enigma of the Dark Matter', '_id': '4ab99f7c922747d9a6a34b855d959779', '_collection_name': 'arxiv_astro-ph_abstracts_astropy_github_documentation'}),
 Document(page_content='  Dark matter could be compose

In [30]:
retriever.invoke("How can I perform celestial coordinate transformations?")

 Document(page_content="  Gorski et al (1999b) have earlier presented the outline of a\npixelisation-to-spherical-coordinate transformation scheme which simultaneously\nsatisfies three properties which are especially useful for rapid analyses of\nmaps on a sphere: (i) equal spacing of pixels along lines of constant latitude,\n(ii) equal pixel `areas' (solid angles) and (iii) hierarchical scaling with\nincreasing numbers of pixels. Their outline is based on the division of the\nsphere into twelve regions covering equal solid angles, which are\nhierarchically subdivided in a way compatible with these three criteria. In\nthis paper, a complete derivation of this scheme is presented, including, in\nparticular, (1) the angle theta^* defining the limit between polar and\nequatorial regions, and (2) the transformations from the unit interval [0,1]\n\\wedge [0,1] to spherical coordinates in a polar region.\n", metadata={'id': 'astro-ph/0409533', 'title': 'A Solution to the Isolatitude, Equi-ar

In [35]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [36]:
print(format_docs(retriever.invoke("How can I perform celestial coordinate transformations?")))

.. _astropy-coordinates-transforming:

Transforming between Systems
****************************

`astropy.coordinates` supports a rich system for transforming
coordinates from one frame to another. While common astronomy frames
are built into Astropy, the transformation infrastructure is dynamic.
This means it allows users to define new coordinate frames and their
transformations. The topic of writing your own coordinate frame or
transforms is detailed in :ref:`astropy-coordinates-design`, and this
section is focused on how to *use* transformations.

The full list of built-in coordinate frames, the included transformations,
and the frame names are shown as a (clickable) graph in the
`~astropy.coordinates` API documentation.

Examples
--------

..
  EXAMPLE START
  Transforming Coordinates to Another Frame

The recommended method of transformation is shown below::

    >>> import astropy.units as u
    >>> from astropy.coordinates import SkyCoord
    >>> gc = SkyCoord(l=0*u.degree, b=4