# RAG Inference

This notebook follows `rag_create_index.ipynb`, in which we built the RAG index. Here we use that index to query a RAG-enabled cosmosage model.

In [None]:
# load model, tokenizer, retriever
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory passed to both `from_pretrained` calls below (model and tokenizer).
model_dir = "models/cosmosage_v2/"

# Quantized loading is not supported yet, so this alternative stays disabled:
# model = AutoGPTQForCausalLM.from_quantized(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)
model = model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Embedding model used to query the FAISS index.
# NOTE(review): presumably the same embedding model that built the index in
# rag_create_index.ipynb -- confirm.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
index = FAISS.load_local("datasets/faiss_index.bin", embeddings)

# Similarity search returning the 4 best-matching entries per query.
retriever = index.as_retriever(search_kwargs={"k": 4}, search_type="similarity")

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough

# Sampling text-generation pipeline built from the already-loaded `model` and `tokenizer`.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.01,
    max_new_tokens=1000,
    return_full_text=True,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Prompt with two slots: `{context}` for retrieved material and `{question}` for the user query.
prompt_template = (
    "You are cosmosage, an AI assistant designed to give detailed and factual "
    "answers to questions about cosmology. Provide some relevant context before "
    "answering the question. You may find the following additional content helpful.\n"
    "ADDITIONAL CONTENT: {context}\n"
    "USER: {question}\n"
    "ASSISTANT:"
)
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

# The raw input goes to the retriever (-> "context") and is passed through
# unchanged (-> "question"); the resulting mapping feeds `llm_chain`.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough

# Relies on `model`, `tokenizer` and `retriever` created in the first code cell.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.7,
    repetition_penalty=1.01,
    return_full_text=True,
    max_new_tokens=1000,
    do_sample=True,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# System part of the instruction format, shared by the baseline and the RAG prompt.
# NOTE(review): "<s>" and "▁" are written literally in the prompt text; confirm
# this is the format the model expects and that the tokenizer does not add a second BOS.
system_prompt = """<s> You are cosmosage, an AI programmed to provide excellent and detailed answers to the user's question. You are an expert cosmology assistant, able to answer questions on the cosmic microwave background, galaxy formation, large scale structure, theoretical cosmology, inflation, big bang nucleosynthesis, cosmology instrumentation, and other related topics. Please assume the user is fluent in scientific terminology. Elaborate where possible to give a complete answer. If you do not know, say you do not know."""
user_turn = "▁ USER: {question}▁ ASSISTANT:"

# Baseline chain: question only, no retrieved context.
prompt_template = system_prompt + user_turn
prompt = PromptTemplate(input_variables=["question"], template=prompt_template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

# RAG chain: same instruction format, with the retrieved passages appended to the
# system part. Previously `rag_chain` was `RunnablePassthrough() | llm_chain`, which
# never called the retriever, so the "RAG" answer was just a second baseline answer.
rag_prompt_template = (
    system_prompt
    + " You may find the following additional content helpful.\nADDITIONAL CONTENT: {context}"
    + user_turn
)
rag_prompt = PromptTemplate(
    input_variables=["context", "question"], template=rag_prompt_template
)


def format_docs(docs):
    """Concatenate the text of the retrieved documents into one context string.

    Args:
        docs: iterable of retrieved documents exposing a `page_content` attribute.

    Returns:
        The documents' texts separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The question is sent to the retriever (formatted into "context") and passed
# through unchanged as "question"; both are then filled into the RAG prompt.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | LLMChain(llm=llm, prompt=rag_prompt)
)



In [None]:
# Query sent to both `llm_chain` and `rag_chain` in the cells below.
question = "What is meant by the Gaussian quadrature approximation to an actual bandpass integral?"

In [None]:
# Ask the question through `llm_chain`, supplying an empty context string.
result = llm_chain.invoke(dict(context="", question=question))
print(result["text"])

In [None]:
# Ask the same question through `rag_chain`, which takes the bare question string.
result_rag = rag_chain.invoke(question)
print(result_rag["text"])

In [None]:
# Display the full mapping returned by `rag_chain`, not just its 'text' entry.
result_rag