In [None]:
from langchain_community.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

import os 
import getpass

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Pick up GOOGLE_API_KEY from a local .env file when one is present.
load_dotenv()

# Prompt only when no key is configured yet, so re-running this cell (or a
# full "Restart & Run All") neither blocks on input nor overwrites the value.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

### Arxivloader ###

Info. - https://python.langchain.com/v0.2/docs/integrations/document_loaders/arxiv/

This is very helpful because it lets us search arXiv and load the matching papers directly as documents.

In [None]:
### Getting relevant papers from arXiv ###

# Each returned document exposes `metadata` and `page_content`.
query = "recent progress on theoretical studies of twisted MoTe2"

# Build the loader once and reuse it.
arxiv_loader = ArxivLoader(query=query, load_max_docs=3, load_all_available_meta=True)

# Only the summary documents are kept. Calling `arxiv_loader.load()` first
# would download and convert the full PDFs and then be overwritten by this
# assignment, so that redundant fetch is not performed.
# Use `arxiv_loader.load()` here instead if full-paper text is wanted.
arxiv_docs = arxiv_loader.get_summaries_as_docs()

In [85]:
?ArxivLoader

[0;31mInit signature:[0m
[0mArxivLoader[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mquery[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc_content_chars_max[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Load a query result from `Arxiv`.
The loader converts the original PDF format into the text.

Setup:
    Install ``arxiv`` and ``PyMuPDF`` packages.
    ``PyMuPDF`` transforms PDF files downloaded from the arxiv.org site
    into the text format.

    .. code-block:: bash

        pip install -U arxiv pymupdf


Instantiate:
    .. code-block:: python

        from langchain_community.document_loaders import ArxivLoader

        loader = ArxivLoader(
            query="reasoning",
            # load_max_docs=2,
          

In [32]:
# List the public (non-underscore) attribute names of the first document,
# each wrapped in double quotes, in the order `__dir__()` reports them.
public_names = [name for name in arxiv_docs[0].__dir__() if name[0] != '_']
for name in public_names:
    print('"' + name + '"')

"id"
"metadata"
"page_content"
"type"
"is_lc_serializable"
"get_lc_namespace"
"lc_secrets"
"lc_attributes"
"lc_id"
"Config"
"to_json"
"to_json_not_implemented"
"dict"
"json"
"parse_obj"
"parse_raw"
"parse_file"
"from_orm"
"construct"
"copy"
"schema"
"schema_json"
"validate"
"update_forward_refs"


### Text splitting ###
Info - https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/recursive_text_splitter/

In [None]:
##### Splitting data #####

# One splitter is enough for every document (chunk_size=500, chunk_overlap=10).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)

# `split_documents` chunks each document's `page_content` and attaches that
# document's metadata to every chunk, returning a single flat list.
pdf_data = text_splitter.split_documents(arxiv_docs)


### Embedding ####

Info - https://python.langchain.com/v0.2/docs/integrations/platforms/huggingface/

In [None]:
## Embed the chunks and index them in a Chroma vector store
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)
db = Chroma.from_documents(
    documents=pdf_data,
    embedding=embeddings,
)

In [19]:
?Chroma.from_documents

[0;31mSignature:[0m
[0mChroma[0m[0;34m.[0m[0mfrom_documents[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdocuments[0m[0;34m:[0m [0;34m'List[Document]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding[0m[0;34m:[0m [0;34m'Optional[Embeddings]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mids[0m[0;34m:[0m [0;34m'Optional[List[str]]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcollection_name[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'langchain'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpersist_directory[0m[0;34m:[0m [0;34m'Optional[str]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclient_settings[0m[0;34m:[0m [0;34m'Optional[chromadb.config.Settings]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclient[0m[0;34m:[0m [0;34m'Optional[chromadb.Client]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m 

In [None]:
# Chat model that generates the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Question-answering chain ("stuff" chain type) that pulls its context from
# the vector store's default retriever.
chunk_retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=chunk_retriever,
)

In [None]:
question = "What are the topological and correlated phenomena in twisted MoTe2?"
# Use `invoke`; calling the chain object directly (`qa({...})`) is deprecated.
result = qa.invoke({"query": question})

print(result)

In [None]:
# NOTE(review): this question does not obviously relate to the twisted-MoTe2
# query used to build the index; it may be left over from a different paper
# set. Confirm it is still intended.
question = "Whats the limit on period derivative?"
# Use `invoke`; calling the chain object directly (`qa({...})`) is deprecated.
result = qa.invoke({"query": question})

print(result)

In [None]:
question = "Summarize the paper in a paragraph"
# Use `invoke`; calling the chain object directly (`qa({...})`) is deprecated.
result = qa.invoke({"query": question})

print(result)

In [None]:
# NOTE(review): FRB 20180916B does not obviously relate to the twisted-MoTe2
# query used to build the index; this may be left over from a different paper
# set. Confirm it is still intended.
question = "Tell me 5 facts about FRB 20180916B"
# Use `invoke`; calling the chain object directly (`qa({...})`) is deprecated.
result = qa.invoke({"query": question})

print(result)

In [None]:
import google.generativeai as genai
import os

# Read the key from the environment instead of embedding it in the notebook.
# NOTE(review): a literal API key was previously hardcoded on this line; treat
# that key as compromised and revoke/rotate it.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

model = genai.GenerativeModel('gemini-1.5-pro')

In [None]:
# Send a single free-form prompt to the model and print the text it returns.
story_prompt = "Write a story about an AI and magic"
response = model.generate_content(story_prompt)
print(response.text)