I'm using [Chris Alexiuk's](https://www.linkedin.com/in/csalexiuk/) [notebook](https://colab.research.google.com/drive/172uMprWwUfEecXQWBrsgDAlkpT_EK39z?usp=sharing)
as a starting point and plan to experiment with some of the ideas from [A Practical Approach to Retrieval Augmented Generation Systems](https://angelinamagr.gumroad.com/l/practical-approach-to-RAG-systems) by Allahyari and Yang.

## Steps
- Experiment with Mistral 7B.  May reduce hallucinations in prompt responses and be faster at inference than Zephyr 7B, which is critical. On the other hand, may require more data to finetune later since it's already aligned with synthetic datasets (or may not if use it on nonsynthetic ArXiv!)
- Also, Zephyr's data included openbmb/UltraFeedback, which included some ArXiv papers, so it may finetune better.
- Be careful if finetune on abstracts only as that may not be representative unless initial data RAGs over abstract as well.  Something to test, though.
- CONCLUSION: For the task at hand, Mistral-7B-Instruct is much faster due to grouped-query attention (GQA) and sliding window attention (SWA).  Subjectively, I find that the responses are at least as good as Zephyr's and have not spotted an extreme hallucination problem.  Proceeding with Mistral-7B-Instruct for now.

## Get the data and build a Retriever

- Original NB worked in under 10GB on V100

In [None]:
!pip install -U -q "langchain" "transformers==4.35.0" "datasets==2.12.0" "tokenizers==0.14.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "arxiv==1.4"
!pip install -U -q cohere llama-index
!pip install PyPDF2
!pip install pypdf
!pip install -q qdrant-client
!pip install -q -U faiss-cpu tiktoken sentence-transformers


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/123.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.1/123.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:

In [None]:
import transformers, datasets, tokenizers
transformers.__version__, datasets.__version__, tokenizers.__version__

('4.51.0', '2.12.0', '0.21.1')

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive/')

output_dir = '/content/drive/MyDrive/PdfRag/rag_output_dir'
logging_dir = '/content/drive/MyDrive/PdfRag/rag_logging_dir'
index_dir = '/content/drive/MyDrive/PdfRag/rag_index_dir'

#!ls /content/drive/MyDrive/PdfRag/clusterofstars
%cd /content/drive/MyDrive/PdfRag
#My\ Drive/PdfRag && ls clusterofstars
!ls .

Mounted at /content/drive/
[Errno 2] No such file or directory: '/content/drive/MyDrive/PdfRag'
/content
drive  sample_data


- The documents consist of a few dozen ArXiv papers about modern LLMs

In [None]:
from pathlib import Path
PDFS_PATH = Path('/content')
PDFS = list(PDFS_PATH.glob('*.pdf'))
PDFS[0], len(PDFS)

(PosixPath('/content/Understanding_Climate_Change.pdf'), 1)

In [None]:
from pathlib import Path

# Verify the path to the PDF files
PDFS_PATH = Path('/content')
# Print the files found in the path to ensure they exist
print(list(PDFS_PATH.glob('*')))
# Print the path to check if it is correct
print(PDFS_PATH)

# List all PDF files in the directory
PDFS = list(PDFS_PATH.glob('*.pdf'))

# Check if PDFS is empty and provide a message if so
if not PDFS:
    print("No PDF files found in the specified directory.")
else:
    # Proceed if PDF files are found
    print(PDFS[0], len(PDFS))

[PosixPath('/content/.config'), PosixPath('/content/drive'), PosixPath('/content/Understanding_Climate_Change.pdf'), PosixPath('/content/sample_data')]
/content
/content/Understanding_Climate_Change.pdf 1


In [None]:
PDFS

[PosixPath('/content/Understanding_Climate_Change.pdf')]

In [None]:
# fastai function to clean GPU memory
import sys,gc,traceback
import torch
def clean_ipython_hist():
    """Blank out the input history that the running IPython shell keeps.

    Does nothing when ``get_ipython`` is not present in this module's
    globals. Otherwise it removes the numbered ``_i<n>`` entries from the
    user namespace, resets ``_i``/``_ii``/``_iii`` to empty strings, and
    overwrites the history manager's parsed and raw input lists with empty
    strings.
    """
    # Logic mainly copied from the IPython source.
    if 'get_ipython' not in globals():
        return

    shell = get_ipython()
    namespace = shell.user_ns
    shell.displayhook.flush()

    # One more than the current prompt count: numbered inputs run 1..count.
    slot_count = shell.displayhook.prompt_count + 1
    for cell_no in range(1, slot_count):
        namespace.pop('_i' + repr(cell_no), None)
    namespace.update({'_i': '', '_ii': '', '_iii': ''})

    history = shell.history_manager
    history.input_hist_parsed[:] = [''] * slot_count
    history.input_hist_raw[:] = [''] * slot_count
    for attr in ('_i', '_ii', '_iii', '_i00'):
        setattr(history, attr, '')



def clean_tb():
    """Drop the last-exception references that the interpreter keeps on ``sys``.

    If ``sys.last_traceback`` exists, its frames are cleared first; then
    ``last_traceback``, ``last_type`` and ``last_value`` are each deleted
    from ``sys`` when present.
    """
    # h/t Piotr Czapla
    if hasattr(sys, 'last_traceback'):
        traceback.clear_frames(sys.last_traceback)
    for attr in ('last_traceback', 'last_type', 'last_value'):
        if hasattr(sys, attr):
            delattr(sys, attr)

def clean_mem():
    """Release memory held by the notebook session.

    Runs, in order: ``clean_tb()``, ``clean_ipython_hist()``, a garbage
    collection pass, and ``torch.cuda.empty_cache()``.
    """
    # Drop references first so the collector below can reclaim the objects.
    clean_tb()
    clean_ipython_hist()
    gc.collect()
    # Called last, after collection (per the cell header, the aim is to
    # free GPU memory).
    torch.cuda.empty_cache()



### Task 1: Prepare the data and  build a PDF Data Loader

In [None]:
from PyPDF2 import PdfReader
reader = PdfReader(os.path.expanduser(PDFS[0]))
pages = reader.pages
documents = []
for page in pages:
  documents.append(page.extract_text())
#print(documents[-1])

In [None]:
from pathlib import Path
import os
from PyPDF2 import PdfReader

# Verify the path to the PDF files
PDFS_PATH = Path('/content')
# Print the files found in the path to ensure they exist
print(list(PDFS_PATH.glob('*')))
# Print the path to check if it is correct
print(PDFS_PATH)

# List all PDF files in the directory
PDFS = list(PDFS_PATH.glob('*.pdf'))

# Check if PDFS is empty and provide a message if so
if not PDFS:
    print("No PDF files found in the specified directory. Please check the path.")
else:
    # Proceed if PDF files are found
    print(PDFS[0], len(PDFS))
    reader = PdfReader(os.path.expanduser(PDFS[0])) #This line was moved inside the else block
    pages = reader.pages
    documents = []
    for page in pages:
      documents.append(page.extract_text())
    #print(documents[-1])

[PosixPath('/content/.config'), PosixPath('/content/drive'), PosixPath('/content/Understanding_Climate_Change.pdf'), PosixPath('/content/sample_data')]
/content
/content/Understanding_Climate_Change.pdf 1


#### First drop everything from References onwards. References were 'confusing' RAG into retrieving primarily titles of papers mentioned there, which is likely not very useful

In [None]:
import PyPDF2

def load_pdf_to_string(pdf_path):
    """Extract the text of a PDF, stopping at its "References" heading.

    Pages are read in order and their extracted text is concatenated. On
    the first page containing a line that consists solely of the word
    "references" (in any letter case), the text is cut just before that
    line and the remaining pages are skipped.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated page text up to, but not including, the references
        heading, or the text of every page when no such heading is found.
    """
    import re  # local import keeps this cell self-contained

    # Search the original text case-insensitively instead of searching
    # page_text.upper(): upper-casing can change the length of a string
    # (for example the U+FB01 "fi" ligature becomes the two letters "FI"),
    # so an index found in the upper-cased copy may not line up with
    # page_text and the page would be cut at the wrong position.
    references_heading = re.compile(r'\nreferences\n', re.IGNORECASE)

    collected = []
    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            heading = references_heading.search(page_text)
            if heading:
                # Keep only what precedes the heading and stop reading.
                collected.append(page_text[:heading.start()])
                break
            collected.append(page_text)
    return ''.join(collected)

# Use the function to load a PDF into a string
text = load_pdf_to_string(os.path.expanduser(PDFS[1]))

IndexError: list index out of range

In [None]:
from pathlib import Path
import os
from PyPDF2 import PdfReader

# Verify the path to the PDF files
PDFS_PATH = Path('/content')
# Print the files found in the path to ensure they exist
print(list(PDFS_PATH.glob('*')))
# Print the path to check if it is correct
print(PDFS_PATH)

# List all PDF files in the directory
PDFS = list(PDFS_PATH.glob('*.pdf'))

# Check if PDFS is empty and provide a message if so
if not PDFS:
    print("No PDF files found in the specified directory. Please check the path.")
else:
    # Proceed if PDF files are found
    print(PDFS[0], len(PDFS))

    # Check if there's more than one PDF file
    if len(PDFS) > 1:
        text = load_pdf_to_string(os.path.expanduser(PDFS[1])) #This line was moved inside the else block
    else:
        print("Only one PDF file found. Using the first one.")
        text = load_pdf_to_string(os.path.expanduser(PDFS[0]))

[PosixPath('/content/.config'), PosixPath('/content/drive'), PosixPath('/content/Understanding_Climate_Change.pdf'), PosixPath('/content/sample_data')]
/content
/content/Understanding_Climate_Change.pdf 1
Only one PDF file found. Using the first one.


In [None]:
def get_title(pdf_path):
    """Return the part of *pdf_path* that follows its last ``/``.

    The path is passed through ``os.path.expanduser`` first; if it contains
    no ``/`` the whole expanded string is returned.
    """
    expanded = os.path.expanduser(pdf_path)
    _, _, file_name = expanded.rpartition('/')
    return file_name

In [None]:
get_title(PDFS[-1])

'Understanding_Climate_Change.pdf'

In [None]:
text.find('References\n')

-1

In [None]:
PDFS[0]

PosixPath('/content/Understanding_Climate_Change.pdf')

In [None]:
all_docs_and_titles = [(load_pdf_to_string(os.path.expanduser(pdf_path)),get_title(pdf_path)) for pdf_path in PDFS]

In [None]:
all_docs = [doc[0] for doc in all_docs_and_titles]
all_titles = [doc[1] for doc in all_docs_and_titles]

In [None]:
from langchain.document_loaders.onedrive_file import CHUNK_SIZE
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import Document

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 30

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    length_function=len,
)
#text_splitter.split_text(all_pages[0])
# docs = [Document(page_content=pages) for pages in all_pages]
docs  = [text_splitter.split_text(doc) for doc in all_docs]
# # docs

ImportError: cannot import name 'CHUNK_SIZE' from 'langchain.document_loaders.onedrive_file' (/usr/local/lib/python3.11/dist-packages/langchain/document_loaders/onedrive_file.py)

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Downloading langchain_community-0.3.21-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_settings-2.8.1-py3-none-any.whl (30 kB)
Installing collected packages: pydantic-settings, langchain-community
Successfully installed langchain-community-0.3.21 pydantic-settings-2.8.1


In [None]:
# Remove the import statement entirely:
#from langchain.document_loaders.onedrive_file import CHUNK_SIZE #This line was causing the issue and is not needed

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import Document

CHUNK_SIZE = 1000 #Define CHUNK_SIZE directly in the code
CHUNK_OVERLAP = 30

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    length_function=len,
)
#text_splitter.split_text(all_pages[0])
# docs = [Document(page_content=pages) for pages in all_pages]
docs  = [text_splitter.split_text(doc) for doc in all_docs]
# # docs

ImportError: cannot import name 'Document' from 'langchain.text_splitter' (/usr/local/lib/python3.11/dist-packages/langchain/text_splitter.py)

In [None]:
# Remove the import statement entirely:
#from langchain.document_loaders.onedrive_file import CHUNK_SIZE #This line was causing the issue and is not needed

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema.document import Document # Import Document from langchain.schema.document

CHUNK_SIZE = 1000 #Define CHUNK_SIZE directly in the code
CHUNK_OVERLAP = 30

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    length_function=len,
)
#text_splitter.split_text(all_pages[0])
# docs = [Document(page_content=pages) for pages in all_pages]
docs  = [text_splitter.split_text(doc) for doc in all_docs]
# # docs

In [None]:
len(docs)

1

In [None]:
tot_len = 0
for text in docs[0]:
    tot_len += len(text)
tot_len #OK, makes sense

73451

In [None]:
len(docs[0])

77

### Task 2: Create an "Index"

- Not yet sure if should use Qdrant or FAISS


#### Selecting the VectorStore


In [None]:
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

In [None]:
from langchain.vectorstores import Qdrant, FAISS

In [None]:
!python -m bitsandbytes
!pip install bitsandbytes --force-reinstall --no-cache-dir

False


python -m bitsandbytes


  warn(msg)
The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/lib/python3.11/dist-packages/cv2/../../lib64')}
  warn(msg)
The following directories listed in your path were found to be non-existent: {PosixPath('/sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events')}
The following directories listed in your path were found to be non-existent: {PosixPath('8013'), PosixPath('//172.28.0.1'), PosixPath('http')}
The following directories listed in your path were found to be non-existent: {PosixPath('/backend-container/containers/build.constraints')}
The following directories listed in your path were found to be non-existent: {PosixPath('--logtostderr --listen_host=172.28.0.12 --target_host=172.28.0.12 --tunnel_background_save_url=https'), PosixPath('//colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/gpu-t4-s-1jb7pjynn9z4s --tunnel_background_save_delay=10s --tun

In [None]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)


  core_embeddings_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

NameError: name 'init_empty_weights' is not defined

In [None]:
#vector_store = FAISS.from_documents(docs, embedder)
#vector_store = FAISS.from_documents((docs[i][j] for i in range(len(docs)) for j in range(len(docs[i]))), embedder)
#vector_store = FAISS.from_documents(docs, embedder)
#vector_store = FAISS.from_documents(docs[0], embedder)
from langchain.schema.document import Document

# Flatten the per-PDF chunk lists into a single list of Documents, tagging
# every chunk with the title of the PDF it came from.
docs = [
    Document(page_content=chunk, metadata={'source': title})
    for title, chunks in zip(all_titles, docs)
    for chunk in chunks
]

# Build the index in one call: all chunks are embedded and added together,
# instead of creating a separate single-document FAISS index per chunk and
# merging them one at a time.
vector_store = FAISS.from_documents(docs, embedder)

vector_store
#vector_store.save_local(index_dir)

In [None]:
vector_store.save_local(index_dir)

### To reload the embeddings made above on the next Colab nb use, run the code below.

In [None]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
# Recreate the same cache-backed embedder that was used to build the index.
store = LocalFileStore("./cache/")

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

# The saved FAISS docstore is a pickle file, so recent langchain-community
# releases refuse to load it unless deserialization is explicitly allowed.
# Only enable this for an index you wrote yourself (here: the one saved to
# index_dir with vector_store.save_local).
vector_store = FAISS.load_local(
    index_dir, embedder, allow_dangerous_deserialization=True
)

Check that the VectorStore is working by embedding a query and retrieving passages from our PDF documents that are close to it.

In [None]:
query = "What is Retrieval-augmented generation?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

In [None]:
query = "What is Self-Rag?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

Let's see how much time the `CacheBackedEmbeddings` pattern saves us:

In [None]:
%%timeit -n 1 -r 1
query = "What is Self-Rag?"
embedding_vector = embedder.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

In [None]:
%%timeit
query = "What is Self-Rag?"
embedding_vector = embedder.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

As we can see, even over a significant number of runs - the cached query is significantly faster than the first instance of the query!

With that, we're ready to move onto Task 3!

### Task 3: Building a Retrieval Chain

In [None]:
from huggingface_hub import notebook_login

notebook_login()

We will be leveraging Tim Dettmers' `bitsandbytes` as well as `accelerate` and `transformers` from Hugging Face to make our model as small as possible. The overall quality of the model is fairly well retained!

In [None]:
transformers.__version__

In [None]:
import torch
import transformers
# BitsAndBytes for 4-bit quantization with NF4-type configuration to load  model in 4-bit precision.
# Will help load the model faster and reduce the memory footprint so that it can be run on Google Colab.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
bng_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)


model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bng_config, #using bnb_config
    device_map='auto'
)

model.eval()

In [None]:
# Load the corresponding tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

Now we need to pack it into a `pipeline` for compatibility with `langchain`!

In [None]:
 #contains highly optimized components some of which are not yet available in PyTorch
!pip install xformers

In [None]:
# Wrap the quantized model and its tokenizer in a text-generation pipeline.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    # Deterministic (greedy) decoding is requested explicitly with
    # do_sample=False; temperature=0.0 is not a valid sampling temperature
    # (it is ignored with a warning, or rejected when sampling is enabled).
    do_sample=False,
    max_new_tokens=256
) # Get a cudann warning, likely since using T4 vs, say A100

In [None]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

Now we can set up our chain.

In [None]:
retriever = vector_store.as_retriever()

In [None]:
from langchain.chains import RetrievalQA,RetrievalQAWithSourcesChain
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)
# qa_with_sources_chain = RetrievalQAWithSourcesChain.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     callbacks=[handler],
#     return_source_documents=True
# )

- Try using RetrievalQAWithSourcesChain

In [None]:
#qa_with_sources_chain({"question" : "What makes Self-Rag different from Rag?"})

- Well, so far my use of RetrievalQAWithSourcesChain does not seem to be working well at all!!!  Go back to RetrievalQA

In [None]:
qa_with_sources_chain({"query" : "How does Self-Rag compare to Rag?"})

- However, adding 'source' metadata to Document yielded a result that has the article title, which was one of the desired results.  If I end up needing to filter articles by metadata with FAISS, see https://github.com/langchain-ai/langchain/discussions/10983.

In [None]:
qa_with_sources_chain({"query" : "What is QLoRa?"})

In [None]:
qa_with_sources_chain({"query" : "Did these papers explore themes of existentialism?"})

In [None]:
#qa_with_sources_chain({"query" : " Open large language models (LLMs) with great performance in various tasks have significantly advanced the development of LLMs. However, they are far inferior to commercial models such as ChatGPT and GPT-4 when acting as agents to tackle complex tasks in the real world. These agent tasks employ LLMs as the central controller responsible for planning, memorization, and tool utilization, necessitating both fine-grained prompting methods and robust LLMs to achieve satisfactory performance. Though many prompting methods have been proposed to complete particular agent tasks, there is lack of research focusing on improving the agent capabilities of LLMs themselves without compromising their general abilities. In this work, we present AgentTuning, a simple and general method to enhance the agent abilities of LLMs while maintaining their general LLM capabilities. We construct AgentInstruct, a lightweight instruction-tuning dataset containing high-quality interaction trajectories. We employ a hybrid instruction-tuning strategy by combining AgentInstruct with open-source instructions from general domains. AgentTuning is used to instruction-tune the Llama 2 series, resulting in AgentLM. Our evaluations show that AgentTuning enables LLMs' agent capabilities without compromising general abilities. The AgentLM-70B is comparable to GPT-3.5-turbo on unseen agent tasks, demonstrating generalized agent capabilities. We open source the AgentInstruct and AgentLM-7B, 13B, and 70B models at https://github.com/THUDM/AgentTuning, serving open and powerful alternatives to commercial LLMs for agent tasks. "})

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain

In [None]:
query="Open large language models (LLMs) with great performance in various tasks have significantly advanced the development of LLMs. However, they are far inferior to commercial models such as ChatGPT and GPT-4 when acting as agents to tackle complex tasks in the real world. These agent tasks employ LLMs as the central controller responsible for planning, memorization, and tool utilization, necessitating both fine-grained prompting methods and robust LLMs to achieve satisfactory performance. Though many prompting methods have been proposed to complete particular agent tasks, there is lack of research focusing on improving the agent capabilities of LLMs themselves without compromising their general abilities. In this work, we present AgentTuning, a simple and general method to enhance the agent abilities of LLMs while maintaining their general LLM capabilities. We construct AgentInstruct, a lightweight instruction-tuning dataset containing high-quality interaction trajectories. We employ a hybrid instruction-tuning strategy by combining AgentInstruct with open-source instructions from general domains. AgentTuning is used to instruction-tune the Llama 2 series, resulting in AgentLM. Our evaluations show that AgentTuning enables LLMs' agent capabilities without compromising general abilities. The AgentLM-70B is comparable to GPT-3.5-turbo on unseen agent tasks, demonstrating generalized agent capabilities. We open source the AgentInstruct and AgentLM-7B, 13B, and 70B models at https://github.com/THUDM/AgentTuning, serving open and powerful alternatives to commercial LLMs for agent tasks. "

In [None]:
os.listdir('.')

In [None]:
chain = load_summarize_chain(llm, chain_type="stuff")
from langchain.document_loaders import TextLoader
query = TextLoader('testabstract').load()
#query = query_loader.load({"text" : "How does Self-Rag compare to Rag?"})
query="Which papers are most similar to the article with the following summary?  Article summary: " + chain.run(query)

In [None]:
qa_with_sources_chain({"query" : query})

In [None]:
chain = load_summarize_chain(llm, chain_type="stuff")
from langchain.document_loaders import TextLoader
query = TextLoader('instructionmining').load()
#query = query_loader.load({"text" : "How does Self-Rag compare to Rag?"})
query="Which papers are most similar to the article with the following summary?  Article summary: " + chain.run(query)

In [None]:
qa_with_sources_chain({"query" : query})

In [None]:
chain = load_summarize_chain(llm, chain_type="stuff")
from langchain.document_loaders import TextLoader
query = TextLoader('instructionmining').load()
#query = query_loader.load({"text" : "How does Self-Rag compare to Rag?"})
query="Which papers are most different to the article with the following summary?  Article summary: " + chain.run(query)
qa_with_sources_chain({"query" : query})

### ArXiv API

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
locale.getpreferredencoding() ### SOLVED THE UTF-8 ISSUE!

In [None]:
#from langchain.retrievers import ArxivRetrieverM # GETTING ERROR, probably updated API
from langchain.retrievers.arxiv import ArxivRetriever

In [None]:
retriever = ArxivRetriever(load_max_docs=2)

In [None]:
docs = retriever.get_relevant_documents(query="2311.05610")

In [None]:
docs

In [None]:
docs[0].page_content

In [None]:
title, query = docs[0].metadata['Title'],docs[0].page_content.replace('\n', ' ')

In [None]:
query="Be thorough and explain your reasoning step by step.  Which of the retrieved documents has the most in common with the article with the following summary?  .  Article summary: " + query
qa_with_sources_chain({"query" : query})

In [None]:
clean_mem()