# Retrieval-Augmented Generation: make your own local chat from GitHub docs



## Intro


## Baking our final results


In [1]:
import pandas as pd

# Load the pickled arXiv astro-ph abstracts object from the local resources folder.
# NOTE(review): `astro_ph_df` is not referenced again in the cells visible here — confirm it is still needed.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
astro_ph_df = pd.read_pickle("resources/data/astro-ph-arXiv-abstracts.pkl")

In [2]:
from langchain import PromptTemplate
from llama_cpp import Llama

from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import StreamingStdOutCallbackHandler

from ssec_usrse2024 import OLMO_MODEL

from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer, util

import os

from langchain_community.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

from qdrant_client import QdrantClient

# Wrap the local OLMo model file (path taken from OLMO_MODEL) in LangChain's
# LlamaCpp LLM, sampling at temperature 0.8 with verbose output disabled.
olmo = LlamaCpp(model_path=str(OLMO_MODEL), temperature=0.8, verbose=False)

# Build the prompt from the chat template stored in the loaded model's own
# metadata. The template is Jinja2; the cells below fill in `messages`,
# `add_generation_prompt` and `eos_token` when rendering it.
prompt_template = PromptTemplate.from_template(
    template=olmo.client.metadata["tokenizer.chat_template"],
    template_format="jinja2",
)



def ask_question(question):
    """Send a context-free astrophysics question through the global chain.

    Prints a separator banner with the question, wraps the question in a
    single user message and invokes ``llm_chain`` with a stdout streaming
    callback.

    Note:
        ``llm_chain`` is a module-level name that must already exist when this
        function is *called* (it is assigned in a later cell of the notebook).

    Args:
        question: The question text to put to the model.

    Returns:
        Whatever ``llm_chain.invoke`` returns for the rendered prompt.
    """
    print(f"\n\n------------------------------------------\nQuestion: {question}")

    # The leading whitespace on the continuation line is part of the prompt text.
    user_content = f"""You are an astrophysics expert. Please answer the following question on astrophysics. 
            Question: {question}"""

    chain_inputs = {
        "messages": [{"role": "user", "content": user_content}],
        "add_generation_prompt": True,
        "eos_token": "<|endoftext|>",
    }
    streaming_config = {'callbacks': [StreamingStdOutCallbackHandler()]}

    return llm_chain.invoke(chain_inputs, config=streaming_config)

# Sentence-embedding model passed to the Qdrant vector store below.
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# TODO: Fix module paths
qdrant_path = "resources/data/qdrant/scipy_qdrant/"

# TODO: Change collection name (target name was left unspecified)
qdrant_collection = "arxiv_astro-ph_abstracts"

if not os.path.exists(qdrant_path):
    # Fail here with an actionable message; otherwise `qdrant` would be left
    # undefined and the retriever line below would raise a confusing NameError.
    raise FileNotFoundError(
        f"Qdrant storage not found at '{qdrant_path}'; "
        f"cannot load collection '{qdrant_collection}'."
    )

print(f"Loading existing Qdrant collection '{qdrant_collection}'")

# Open the on-disk Qdrant storage and expose the collection as a LangChain vector store.
client = QdrantClient(path=qdrant_path)

qdrant = Qdrant(
    client=client,
    collection_name=qdrant_collection,
    embeddings=model
)

# Retriever over the collection using MMR search, returning k=2 documents per query.
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


  from tqdm.autonotebook import tqdm, trange


Loading existing Qdrant collection 'arxiv_astro-ph_abstracts'


In [3]:

question = "What is dark matter?"

# Retrieve the documents most relevant to the question and flatten them to text.
context = format_docs(retriever.invoke(question))

# Template variables for the retrieval-augmented prompt. They are defined once
# so the rendered preview and the actual model call cannot drift apart
# (previously the prompt literal was duplicated with differing whitespace).
# NOTE: the indentation on the continuation lines is part of the prompt text.
rag_inputs = {
    "messages": [
        {
            "role": "user",
            "content": f"""You are an expert at astrophysics. Please answer the question on astrophysics based on the following context:
    
                Context: {context}
                
                Question: {question}""",
        }
    ],
    "add_generation_prompt": True,
    "eos_token": "<|endoftext|>",
}

# Render the full prompt and keep it for inspection; the result used to be
# computed and silently discarded because it was not the cell's last expression.
rag_prompt_preview = prompt_template.format(**rag_inputs)

# Pipe the prompt template into the model to form the chain used from here on.
llm_chain = prompt_template | olmo

# Stream tokens to stdout while generating; the returned text is the cell output.
llm_chain.invoke(
    rag_inputs,
    config={
        'callbacks' : [StreamingStdOutCallbackHandler()]
    }
)

 Dark matter is a theoretical particle that, according to contemporary astrophysical observations, makes up approximately 80% of the total mass in the universe (1). This mysterious substance has no illuminating properties, meaning it does not emit, reflect, or refract light. In other words, dark matter is invisible to our current technology and understanding.

While its properties are still largely unknown, scientists suggest that it may be made up of stars and black holes that have cooled down to the point where they do not emit light anymore (2). This idea, called the Warm Dark Matter (WDM) model, aims to provide a more comprehensive picture of dark matter's distribution and properties compared to the Cold Dark Matter (CDM) hypothesis.

Despite numerous theories and research in this area, understanding the nature and origin of dark matter remains one of the greatest unsolved mysteries in cosmology today. This review is aimed at providing an accessible yet rigorous introduction for ad

' Dark matter is a theoretical particle that, according to contemporary astrophysical observations, makes up approximately 80% of the total mass in the universe (1). This mysterious substance has no illuminating properties, meaning it does not emit, reflect, or refract light. In other words, dark matter is invisible to our current technology and understanding.\n\nWhile its properties are still largely unknown, scientists suggest that it may be made up of stars and black holes that have cooled down to the point where they do not emit light anymore (2). This idea, called the Warm Dark Matter (WDM) model, aims to provide a more comprehensive picture of dark matter\'s distribution and properties compared to the Cold Dark Matter (CDM) hypothesis.\n\nDespite numerous theories and research in this area, understanding the nature and origin of dark matter remains one of the greatest unsolved mysteries in cosmology today. This review is aimed at providing an accessible yet rigorous introduction 

In [4]:

question = "What is dark matter?"

# Template variables for the baseline (no retrieved context) prompt. They are
# defined once so the rendered preview and the model call share the same input.
# NOTE: the indentation on the continuation line is part of the prompt text.
baseline_inputs = {
    "messages": [
        {
            "role": "user",
            "content": f"""You are an expert at astrophysics. Please answer the question on astrophysics. 
                Question: {question}""",
        }
    ],
    "add_generation_prompt": True,
    "eos_token": "<|endoftext|>",
}

# Render the full prompt and keep it for inspection; the result used to be
# computed and silently discarded because it was not the cell's last expression.
baseline_prompt_preview = prompt_template.format(**baseline_inputs)

# `llm_chain` is the prompt | model chain built in the previous cell.
llm_chain.invoke(
    baseline_inputs,
    config={
        'callbacks' : [StreamingStdOutCallbackHandler()]
    }
)

Dark matter is a theoretical particle that exists within the framework of astronomical and cosmological physics, though it has not yet been directly detected through laboratory experiments or other means. Dark matter makes up approximately 85% of the matter in the universe, while visible matter (which includes stars, planets, gases, and dust) only accounts for about 5% of the total mass-energy density of the observable universe.

This discrepancy between visible matter and dark matter is known as the "missing mass" problem. Dark matter is believed to interact with ordinary matter through gravity only, and it does not emit or absorb light, making its detection extremely challenging. Researchers use various methods, such as studying the motions of galaxies and their orbits around the center of the Milky Way, to infer the existence and properties of dark matter.

Despite extensive searching over the last few decades, no direct evidence for dark matter particles has yet been discovered. Ho

'Dark matter is a theoretical particle that exists within the framework of astronomical and cosmological physics, though it has not yet been directly detected through laboratory experiments or other means. Dark matter makes up approximately 85% of the matter in the universe, while visible matter (which includes stars, planets, gases, and dust) only accounts for about 5% of the total mass-energy density of the observable universe.\n\nThis discrepancy between visible matter and dark matter is known as the "missing mass" problem. Dark matter is believed to interact with ordinary matter through gravity only, and it does not emit or absorb light, making its detection extremely challenging. Researchers use various methods, such as studying the motions of galaxies and their orbits around the center of the Milky Way, to infer the existence and properties of dark matter.\n\nDespite extensive searching over the last few decades, no direct evidence for dark matter particles has yet been discovere

In [None]:
# TODO: Fix module paths
qdrant_path2 = "resources/data/qdrant/usrse_qdrant/"

# TODO: Change collection name (target name was left unspecified)
qdrant_collection2 = "astropy_docs"

# Check the path of *this* store (`qdrant_path2`); the guard previously tested
# `qdrant_path`, the first store's directory, so it never validated this one.
if not os.path.exists(qdrant_path2):
    # Fail early: otherwise `qdrant2` stays undefined and the next cell raises NameError.
    raise FileNotFoundError(
        f"Qdrant storage not found at '{qdrant_path2}'; "
        f"cannot load collection '{qdrant_collection2}'."
    )

print(f"Loading existing Qdrant collection '{qdrant_collection2}'")

# Open the second on-disk Qdrant storage, reusing the embedding model `model`.
client2 = QdrantClient(path=qdrant_path2)

qdrant2 = Qdrant(
    client=client2,
    collection_name=qdrant_collection2,
    embeddings=model
)


In [None]:

# Rebind `retriever` to the `qdrant2` store (collection "astropy_docs"), using MMR search with k=2.
# NOTE(review): this replaces the retriever created from `qdrant` earlier; later cells see only this one.
retriever = qdrant2.as_retriever(search_type="mmr", search_kwargs={"k": 2})

def format_docs(docs):
    """Concatenate the text of every retrieved document.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute
            (e.g. the list returned by ``retriever.invoke``).

    Returns:
        The ``page_content`` values joined by a blank line, in input order;
        an empty string when ``docs`` is empty.
    """
    # Iterate over the result list itself. Iterating `docs[0]` walked a single
    # document instead of the list, which fails on real retriever output and
    # on empty results. To limit context size, slice the list before calling.
    return "\n\n".join(doc.page_content for doc in docs)



# Question used by the retrieval and prompting cells that follow.
question = "How can I perform celestial coordinate transformations?"


In [None]:
# Run the retrieval once and keep the raw documents for inspection below.
docs = retriever.invoke(question)

# Reuse the documents fetched above instead of querying the vector store a
# second time for the same question.
context = format_docs(docs)

In [None]:
# Display the first retrieved document.
docs[0]

In [None]:
# Number of whitespace-separated words in `context`, multiplied by three.
# NOTE(review): presumably a rough token-count estimate — confirm the factor.
len(context.split()) *3

In [None]:

# Use only the *text* of the top-ranked document as context. Binding `context`
# to the document object itself made the f-string below embed the object's
# string form (including its metadata) in the prompt instead of the plain text.
context = retriever.invoke(question)[0].page_content

# Render the complete prompt so its size can be checked before calling the model.
# NOTE: the indentation on the continuation lines is part of the prompt text.
astropy_prompt_preview = prompt_template.format(
    messages=[
        {
            "role": "user", 
            "content": f"""You are an expert at the astrophysics package Astropy. Please answer the question on Astropy based on the following context:

            Context: {context}
            
            Question: {question}"""
        }
    ], 
    add_generation_prompt=True, 
    eos_token="<|endoftext|>"
)

# Whitespace-separated word count of the rendered prompt.
print(len(astropy_prompt_preview.split()))

# Raw chat template string, for reference.
print(prompt_template.template)

In [None]:

# Compose the prompt template and the model into a chain (same composition as in the earlier dark-matter cell).
llm_chain = prompt_template | olmo


In [None]:
# Print the chain's text representation to inspect what it is composed of.
print(llm_chain)

In [None]:

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Build a document compressor driven by the same OLMo model used for answering.
compressor = LLMChainExtractor.from_llm(olmo)

# Wrap the current `retriever` so its results pass through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Query with the notebook's `question` variable rather than a second hard-coded
# copy of the same text, so the two cannot drift apart.
# NOTE(review): `compressed_docs` is not used by the cells visible after this one —
# confirm whether it was meant to replace `context` in the final prompt.
compressed_docs = compression_retriever.invoke(question)



In [None]:



# Ask the Astropy question through the chain, streaming tokens to stdout.
# `context` and `question` hold whatever the preceding cells last assigned.
# NOTE: the indentation on the continuation lines is part of the prompt text.
llm_chain.invoke(
    {
        "messages": [
            {
                "role": "user",
                "content": f"""You are an expert at the astrophysics package Astropy. Please answer the question on Astropy based on the following context:
    
                Context: {context}
                
                Question: {question}""",
            },
        ],
        "add_generation_prompt": True,
        "eos_token": "<|endoftext|>",
    },
    config={"callbacks": [StreamingStdOutCallbackHandler()]},
)


## Retrieving GitHub docs



## Vectorization and Embedding



## Retrieval



## Prompting



## Final Results



## To see more, visit SSEC Tutorials! :D <3