# Qdrant Vector Database Creation

In [1]:
import jsonlines
import os
import requests
import typing as t

from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Qdrant
from llama_cpp import Llama
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient

## Download Data 

In [2]:
from ssec_tutorials import ASTROPH_ARXIV_ABSTRACTS, ASTROPY_GITHUB, download_astroph_arxiv_abstracts, download_astropy_github_documents, fetch_and_process_github_rst_files, download_qdrant_data, QDRANT_COLLECTION_NAME, QDRANT_PATH
from ssec_tutorials.scipy_conf import load_docs_from_jsonl

In [3]:
# Make sure the Astropy GitHub documentation file is in the local cache
# (the call returns the path of the cached file)
download_astropy_github_documents()

Astropy github files already exist at /Users/a42/.cache/ssec_tutorials/astropy-github.jsonl


PosixPath('/Users/a42/.cache/ssec_tutorials/astropy-github.jsonl')

In [4]:
# Sanity check: the Astropy docs file must exist before it is loaded later on
assert os.path.exists(ASTROPY_GITHUB)

In [5]:
# Make sure the pickled astro-ph arXiv abstracts are in the local cache
# (the call returns the path of the cached file)
download_astroph_arxiv_abstracts()

PosixPath('/Users/a42/.cache/ssec_tutorials/astro-ph-arXiv-abstracts.pkl')

In [6]:
# Sanity check: the abstracts pickle must exist before it is read later on
assert os.path.exists(ASTROPH_ARXIV_ABSTRACTS)

In [33]:
# Download the Qdrant vector database data into the local cache;
# the call returns the path it is stored at
download_qdrant_data()

PosixPath('/Users/a42/.cache/ssec_tutorials/scipy_qdrant')

## GitHub Documents

In [6]:
# Ask for a GitHub Personal Access Token only when it is actually needed.
# The token is used solely to fetch the Astropy docs from GitHub, and that
# fetch is skipped whenever the cached ASTROPY_GITHUB file already exists,
# so prompting unconditionally would stall an unattended "Run All".
# getpass keeps the token out of the notebook source and its saved output.
if os.path.exists(ASTROPY_GITHUB):
    ACCESS_TOKEN = None
else:
    ACCESS_TOKEN = getpass(prompt="GitHub Personal Access Token: ")

GitHub Personal Access Token:  ········


In [8]:
# GitHub source of the Astropy documentation: repository, branch and the
# directory inside the repository that are passed to the fetch helper
repository = 'astropy/astropy'
branch = 'main'
docs_path = 'docs'

In [7]:
# Prefer the cached copy of the Astropy docs at ASTROPY_GITHUB;
# fall back to fetching the .rst files from GitHub only when no cache exists
if os.path.exists(ASTROPY_GITHUB):
    github_documents = load_docs_from_jsonl(ASTROPY_GITHUB)
else:
    github_documents = fetch_and_process_github_rst_files(
        github_repo=repository,
        github_branch=branch,
        github_documents=docs_path,
        github_personal_access_token=ACCESS_TOKEN,
    )

In [12]:
len(github_documents)

311

## Arxiv Abstracts

In [14]:
# Load the DataFrame of astro-ph arXiv abstracts from the cached pickle file.
# Refer to the notebook in the Appendix to see how this file was built.
# NOTE(review): unpickling can execute arbitrary code, so only read pickle files from a trusted source.
astro_df = pd.read_pickle(ASTROPH_ARXIV_ABSTRACTS)

In [15]:
print("Number of astrophysics papers: ", len(astro_df))

Number of astrophysics papers:  338658


In [16]:
astro_df.head()

Unnamed: 0,id,title,abstract
0,704.0009,"The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
1,704.0017,Spectroscopic Observations of the Intermediate...,Results from spectroscopic observations of t...
2,704.0023,ALMA as the ideal probe of the solar chromosphere,"The very nature of the solar chromosphere, i..."
3,704.0044,Astrophysical gyrokinetics: kinetic and fluid ...,We present a theoretical framework for plasm...
4,704.0048,Inference on white dwarf binary systems using ...,We report on the analysis of selected single...


### Documents Loader

LangChain provides document loaders for many file formats (.txt, .pdf, .docx, .csv, .xlsx, .json) so that their contents can be fed into an LLM. Its loaders for unstructured data can even parse and load audio from YouTube.

Once loaded into LangChain, the documents can be pre-processed in whatever way the LLM application requires.

In [17]:
from langchain_community.document_loaders import DataFrameLoader

In [18]:
# Load the DataFrame of abstracts into memory as LangChain Document objects.
# The "abstract" column becomes each Document's page_content; the remaining
# columns (id, title) show up in the Document metadata in the retrieval
# output further down.
loader = DataFrameLoader(astro_df, page_content_column="abstract") 
astrophysics_abstracts_documents = loader.load()

In [19]:
print("Number of astrophysics papers: ", len(astrophysics_abstracts_documents))

Number of astrophysics papers:  338658


In [20]:
# Combine both corpora (arXiv abstracts first, then the Astropy docs) into the
# single list of Documents that gets indexed in the vector database
all_documents = astrophysics_abstracts_documents + github_documents
print("Total Number of Documents: ", len(all_documents))

Total Number of Documents:  338969


## Qdrant Creation

In [30]:
QDRANT_PATH

PosixPath('/Users/a42/.cache/ssec_tutorials/scipy_qdrant')

In [31]:
QDRANT_COLLECTION_NAME

'arxiv_astro-ph_abstracts_astropy_github_documentation'

In [22]:
# Embedding model used to turn text into vectors; the same object is handed to
# Qdrant both when loading an existing collection and when building a new one
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

In [34]:
# Reuse the on-disk Qdrant database when it is already present (for example
# after download_qdrant_data()); otherwise embed every document and build it.
# NOTE(review): only the existence of QDRANT_PATH is checked, not that the
# collection itself is present inside it.
if QDRANT_PATH.exists():
    print(f"If the Qdrant Vector Database Collection already exists in {QDRANT_PATH}, load it")
    client = QdrantClient(path=str(QDRANT_PATH))
    qdrant = Qdrant(
        client=client,
        collection_name=QDRANT_COLLECTION_NAME,
        embeddings=model
    )
else:
    # Report the collection via the imported QDRANT_COLLECTION_NAME constant;
    # no `qdrant_collection` variable is defined anywhere in this notebook.
    print(f"Creating new Qdrant collection '{QDRANT_COLLECTION_NAME}' from {len(all_documents)} documents")

    # Load the documents into a Qdrant Vector Database Collection;
    # this will save locally under QDRANT_PATH as sqlite
    qdrant = Qdrant.from_documents(
        documents=all_documents,
        embedding=model,
        path=str(QDRANT_PATH),
        collection_name=QDRANT_COLLECTION_NAME,
    )

If the Qdrant Vector Database Collection already exists in /Users/a42/.cache/ssec_tutorials/scipy_qdrant, load it


In [35]:
# Number of documents stored in the Qdrant collection.
# Uses the imported QDRANT_COLLECTION_NAME constant so the cell also works
# on a fresh kernel (no `qdrant_collection` variable is defined in this notebook).
qdrant.client.count(collection_name=QDRANT_COLLECTION_NAME)

CountResult(count=338969)

In [36]:
# Set up the retriever used in the later retrieval steps.
# search_type="mmr" selects LangChain's maximal-marginal-relevance search and
# k=2 asks for two documents per query.
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [37]:
retriever.invoke("What is dark matter?")

[Document(page_content='  I give a review of the development of the concept of dark matter. The dark\nmatter story passed through several stages from a minor observational puzzle to\na major challenge for theory of elementary particles. Modern data suggest that\ndark matter is the dominant matter component in the Universe, and that it\nconsists of some unknown non-baryonic particles. Properties of dark matter\nparticles determine the structure of the cosmic web.\n', metadata={'id': 1109.558, 'title': 'Dark matter', '_id': '363091ccc8f643fa9b51eed9aa157ad9', '_collection_name': 'arxiv_astro-ph_abstracts_astropy_github_documentation'}),
 Document(page_content='  Even though there are strong astrophysical and cosmological indications to\nsupport the existence of dark matter, its exact nature remains unknown. We\nexpect dark matter to produce standard model particles when annihilating or\ndecaying, assuming that it is composed of Weakly Interacting Massive Particles\n(WIMPs). These standar

In [34]:
retriever.invoke("How can I perform celestial coordinate transformations?")

 Document(page_content="  AIMS: An alternative to the traditional method for modeling kinematics of the\nEarth's rotation is proposed. The purpose of developing the new approach is to\nprovide a self-consistent and simple description of the Earth's rotation in a\nway that can be estimated directly from observations without using intermediate\nquantities.\n  METHODS: Instead of estimating the time series of pole coordinates, the\nUT1--TAI angles, their rates, and the daily offsets of nutation, it is proposed\nto estimate coefficients of the expansion of a small perturbational rotation\nvector into basis functions. The resulting transformation from the terrestrial\ncoordinate system to the celestial coordinate system is formulated as a product\nof an a priori matrix of a finite rotation and an empirical vector of a\nresidual perturbational rotation. In the framework of this approach, the\nspecific choice of the a priori matrix is irrelevant, provided the angles of\nthe residual rotation 

In [35]:
# Post-processing
def format_docs(docs):
    """Merge the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute (e.g. the Documents returned by the retriever).

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

In [36]:
print(format_docs(retriever.invoke("How can I perform celestial coordinate transformations?")))

.. _astropy-coordinates-transforming:

Transforming between Systems
****************************

`astropy.coordinates` supports a rich system for transforming
coordinates from one frame to another. While common astronomy frames
are built into Astropy, the transformation infrastructure is dynamic.
This means it allows users to define new coordinate frames and their
transformations. The topic of writing your own coordinate frame or
transforms is detailed in :ref:`astropy-coordinates-design`, and this
section is focused on how to *use* transformations.

The full list of built-in coordinate frames, the included transformations,
and the frame names are shown as a (clickable) graph in the
`~astropy.coordinates` API documentation.

Examples
--------

..
  EXAMPLE START
  Transforming Coordinates to Another Frame

The recommended method of transformation is shown below::

    >>> import astropy.units as u
    >>> from astropy.coordinates import SkyCoord
    >>> gc = SkyCoord(l=0*u.degree, b=4