In [120]:
from langchain_community.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

import os
import getpass

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI

import arxiv

### ArxivLoader ###

Info. - https://python.langchain.com/v0.2/docs/integrations/document_loaders/arxiv/

This is super helpful because it lets us search arXiv and load the matching papers directly as LangChain documents.


In [109]:
### Getting relevant paper from arxiv #####

query = "FRB 20180916B, "
query = "The Impact of Positive AGN Feedback on the Properties of Galaxies in a Semi-Analytic Model of Galaxy Formation"
arxiv_docs = ArxivLoader(query=query, load_max_docs=3).load() #### Loads number of paper given the query

### Text splitting ###
Info - https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

In [122]:
##### splitting data ####

pdf_data = []
for doc in arxiv_docs:
    text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100)
    texts = text_splitter.create_documents([doc.page_content])
    pdf_data.append(texts)


### Embedding ####

Info - https://python.langchain.com/v0.2/docs/integrations/platforms/huggingface/

In [123]:

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
db = Chroma.from_documents(pdf_data[0], embeddings)

In [124]:
os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

Enter your Google AI API key: ··········


In [125]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro",
                             temperature=0,
                             max_tokens=None,
                             timeout=None,
                             max_retries=2,)

qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=db.as_retriever())

In [131]:
question = "What was the total exposure on the source?"
result = qa({"query": question})

print(result)

{'query': 'What was the total exposure on the source?', 'result': 'The total exposure on the source in the aforementioned time interval is 201 hours. \n'}


In [132]:
question = "Whats the limit on period derivative?"
result = qa({"query": question})

print(result)

{'query': 'Whats the limit on period derivative?', 'result': "I'm sorry, but the text provided does not mention a specific limit on the period derivative. It only mentions a value of 1.5 × 10−4 day day−1 for the absolute period derivative. \n"}


In [129]:
question = "Summarize the paper in a paragraph"
result = qa({"query": question})

print(result)

{'query': 'Summarize the paper in a paragraph', 'result': "I'm sorry, but I cannot summarize the paper based on the provided snippets. The context only provides scattered phrases and doesn't reveal the paper's actual content or arguments.  Please provide more information from the paper for a proper summary. \n"}


In [130]:
question = "Tell me 7 facts about FRB 20180916B"
result = qa({"query": question})

print(result)

{'query': 'Tell me 7 facts about FRB 20180916B', 'result': 'The provided text only gives us a few details about FRB 20180916B and is not enough to extract 7 facts. \n\nHere is what we know:\n\n1. **FRB 20180916B has a significantly lower Rotation Measure (RM) than FRB 20121102A.** It is mentioned that the difference is four orders of magnitude.\n2. **The lower RM of FRB 20180916B suggests something about its immediate environment.** The exact implication is not stated in the provided text. \n\nWe need more information to provide 7 facts about FRB 20180916B. \n'}


### Using Retrieval Augmented Dual Instruction Tuning (RA-DIT) ###

Info: https://cobusgreyling.medium.com/fine-tuning-llms-with-retrieval-augmented-generation-rag-c66e56aec858

---



In [134]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate
#from llama_index.core import SimpleDocument  # Import directly from llama_index

# Define the prompt template
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Create a list of Document objects
# Replace with your actual document content
#documents = [
#    SimpleDocument(text="burst rate and morphological evolution of the periodically repeating FRB 20180916B."),
#    SimpleDocument(text="arxiv_docs")
##]

#documents = [SimpleDocument(text=text) for text in pdf_data]


# Create an instance of VectorStoreIndex
vector_index = VectorStoreIndex.from_documents(pdf_data)

# Set up the vector retriever
vector_retriever = vector_index.as_retriever(similarity_top_k=1)


AttributeError: 'list' object has no attribute 'get_doc_id'

In [None]:
def save_openai_data(dataset, out_path):
    # out_fp = open("data_rag/qa_pairs_openai.jsonl", "w")
    out_fp = open(out_path, "w")
    # TODO: try with different system prompts
    system_prompt = {
        "role": "system",
        "content": "You are a helpful assistant helping to answer questions about the Llama 2 paper.",
    }
    train_qr_pairs = dataset.qr_pairs
    for line in train_qr_pairs:
        query, response = line
        user_prompt = {"role": "user", "content": query}
        assistant_prompt = {"role": "assistant", "content": response}
        out_dict = {
            "messages": [system_prompt, user_prompt, assistant_prompt],
        }
        out_fp.write(json.dumps(out_dict) + "\n")
save_openai_data(train_dataset, "data_rag/qa_pairs_openai.jsonl")