In [1]:
from langchain_community.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

import os 
import getpass

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI

### ArxivLoader ###

Info. - https://python.langchain.com/v0.2/docs/integrations/document_loaders/arxiv/

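This loader is very helpful here: given a search query, it fetches the matching papers from arXiv and converts each PDF into plain text that we can split, embed, and query.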

In [76]:
### Fetch the papers matching a free-text arXiv search ###

# Each loaded document exposes 'metadata' and 'page_content'.
query = "recent progress on theoretical studies of twisted MoTe2"
arxiv_loader = ArxivLoader(query=query, load_max_docs=5)  # load_max_docs limits how many papers are loaded
arxiv_docs = arxiv_loader.load()

In [85]:
# Interactive IPython help lookup for ArxivLoader (development aid; can be removed from the final notebook).
?ArxivLoader

[0;31mInit signature:[0m
[0mArxivLoader[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mquery[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc_content_chars_max[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Load a query result from `Arxiv`.
The loader converts the original PDF format into the text.

Setup:
    Install ``arxiv`` and ``PyMuPDF`` packages.
    ``PyMuPDF`` transforms PDF files downloaded from the arxiv.org site
    into the text format.

    .. code-block:: bash

        pip install -U arxiv pymupdf


Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import ArxivLoader

        loader = ArxivLoader(
            query="reasoning",
            # load_max_docs=2,
          

In [32]:
# Show the public attribute names (those not starting with '_') of the first
# loaded document, each wrapped in double quotes.
public_names = [name for name in arxiv_docs[0].__dir__() if name[0] != '_']
for name in public_names:
    print(f'"{name}"')

"id"
"metadata"
"page_content"
"type"
"is_lc_serializable"
"get_lc_namespace"
"lc_secrets"
"lc_attributes"
"lc_id"
"Config"
"to_json"
"to_json_not_implemented"
"dict"
"json"
"parse_obj"
"parse_raw"
"parse_file"
"from_orm"
"construct"
"copy"
"schema"
"schema_json"
"validate"
"update_forward_refs"


In [36]:
# Display the serialised form of the first loaded document.
first_doc = arxiv_docs[0]
first_doc.to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'Published': '2023-07-11',
   'Title': 'A CHIME/FRB study of burst rate and morphological evolution of the periodically repeating FRB 20180916B',
   'Authors': 'Ketan R. Sand, Daniela Breitman, Daniele Michilli, Victoria M. Kaspi, Pragya Chawla, Emmanuel Fonseca, Ryan Mckinven, Kenzie Nimmo, Ziggy Pleunis, Kaitlyn Shin, Bridget C. Andersen, Mohit Bhardwaj, P. J. Boyle, Charanjot Brar, Tomas Cassanelli, Amanda M. Cook, Alice P. Curtin, Fengqiu Adam Dong, Gwendolyn M. Eadie, B. M. Gaensler, Jane Kaczmarek, Adam Lanman, Calvin Leung, Kiyoshi W. Masui, Mubdi Rahman, Ayush Pandhi, Aaron B. Pearlman, Emily Petroff, Masoud Rafiei-Ravandi, Paul Scholz, Vishwangi Shah, Kendrick Smith, Ingrid Stairs, David C. Stenning',
   'Summary': 'FRB 20180916B is a repeating Fast Radio Burst (FRB) with a 16.3-day\nperiodicity in its activity. In this study, we present morphological properties\n

### Text splitting ###
Info - https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

In [79]:
##### Split every paper into small chunks ####

# One splitter serves all papers: its settings do not depend on the document,
# so it is built once instead of once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
)

# split_documents() chunks each paper's page_content and attaches that paper's
# metadata to every chunk it produces, returning one flat list of chunk
# Documents (same result as calling create_documents per paper and appending).
pdf_data = text_splitter.split_documents(arxiv_docs)


### Embedding ####

Info - https://python.langchain.com/v0.2/docs/integrations/platforms/huggingface/

In [80]:

# Embedding model used to vectorise every chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Build the Chroma vector store from the chunk list.
# NOTE(review): no collection_name / persist_directory is passed, so the
# defaults apply; re-running this cell in the same kernel may add the same
# chunks again -- verify before relying on retrieval results.
db = Chroma.from_documents(pdf_data, embeddings)

In [62]:
# Interactive IPython help lookup for Chroma.from_documents (development aid; can be removed from the final notebook).
?Chroma.from_documents

[0;31mSignature:[0m
[0mChroma[0m[0;34m.[0m[0mfrom_documents[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdocuments[0m[0;34m:[0m [0;34m'List[Document]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding[0m[0;34m:[0m [0;34m'Optional[Embeddings]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mids[0m[0;34m:[0m [0;34m'Optional[List[str]]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcollection_name[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'langchain'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpersist_directory[0m[0;34m:[0m [0;34m'Optional[str]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclient_settings[0m[0;34m:[0m [0;34m'Optional[chromadb.config.Settings]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclient[0m[0;34m:[0m [0;34m'Optional[chromadb.Client]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m 

In [70]:
# Ask for the Google AI API key only when it is not already present in the
# environment, so that "Restart & Run All" does not block on a prompt when the
# key has been exported beforehand. The key is never written into the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [82]:
# Gemini chat model settings: temperature 0, up to 2 retries, and no explicit
# token or timeout limit.
llm_settings = dict(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
llm = ChatGoogleGenerativeAI(**llm_settings)

# Retrieval QA chain: answers questions with `llm`, using the retriever
# obtained from the Chroma index `db`.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

I0000 00:00:1723737848.266633 13601294 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1723737848.266972 13601294 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [84]:
question = "What are the topological and correlated phenomena in twisted MoTe2?"
# Calling the chain object directly (qa({...})) is deprecated in LangChain;
# invoke() is the supported entry point and returns the same dict
# containing the 'query' and 'result' keys.
result = qa.invoke({"query": question})

print(result)

{'query': 'What are the topological and correlated phenomena in twisted MoTe2?', 'result': "Based on the provided text, twisted MoTe2 exhibits these topological and correlated phenomena:\n\n* **Chern insulator behavior:** Notably at a twist angle of 2.75 degrees, exhibiting a similar G-valley moiré potential and pseudospin texture. (References 1-4, 43)\n* **Fractional Chern insulator behavior:** Observed in specific twisted MoTe2 configurations. (References 33, 44)\n* **Interplay between topology and correlations:** Particularly in the second moiré band, impacting band topology and potentially leading to different topological phases. (Reference 1)\n* **Quantum geometry influence:** The quantum geometry of the system is linked to the local density of states (LDOS) profile, which can be observed using scanning tunneling spectroscopy (STS). This connection is crucial for understanding the system's band topology. (Reference 1)\n* **Potential for other phases:**  Depending on the filling fa

In [73]:
question = "Whats the limit on period derivative?"
# Calling the chain object directly (qa({...})) is deprecated in LangChain;
# invoke() is the supported entry point and returns the same dict
# containing the 'query' and 'result' keys.
result = qa.invoke({"query": question})

print(result)

{'query': 'Whats the limit on period derivative?', 'result': 'The provided text states that the current period derivative limit is 1.5 × 10−4 days/day. \n'}


In [74]:
question = "Summarize the paper in a paragraph"
# Calling the chain object directly (qa({...})) is deprecated in LangChain;
# invoke() is the supported entry point and returns the same dict
# containing the 'query' and 'result' keys.
result = qa.invoke({"query": question})

print(result)

{'query': 'Summarize the paper in a paragraph', 'result': 'This document does not contain a paper to summarize. It lists publication details of several research papers, likely as part of a reference section for another paper.  I cannot provide a summary without the actual content of the paper these references belong to. \n'}


In [75]:
question = "Tell me 5 facts about FRB 20180916B"
# Calling the chain object directly (qa({...})) is deprecated in LangChain;
# invoke() is the supported entry point and returns the same dict
# containing the 'query' and 'result' keys.
result = qa.invoke({"query": question})

print(result)

{'query': 'Tell me 5 facts about FRB 20180916B', 'result': '1. **Predictable periodicity:** FRB 20180916B exhibits a predictable periodicity, which helps in planning multi-frequency follow-up observations.\n2. **Wide frequency range:** It has been detected across a wide range of radio frequencies, spanning over 4 octaves, from as low as 110 MHz.\n3. **Low RM compared to FRB 20121102A:** The Rotation Measure (RM) of FRB 20180916B is significantly smaller (four orders of magnitude) than that of FRB 20121102A.\n4. **Stable RM until 2021:**  The RM of FRB 20180916B remained relatively stable until the year 2021.\n5. **Linear RM increase observed:**  In 2023, a study by Mckinven et al. using CHIME/FRB data reported a linear increase in the RM of FRB 20180916B. \n'}
