<a href="https://colab.research.google.com/github/terekli/RAG_assisted_research/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# %pip install langchain
# %pip install -U langchain-community
# %pip install sentence-transformers
# %pip install chromadb
# %pip install bitsandbytes
# %pip install accelerate
# %pip install -i https://pypi.org/simple/ bitsandbytes
# %pip install transformers --upgrade

# Partition and embedd PDF manuscripts into vector database

In [None]:
import os
import pickle
import gc

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements

from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain.vectorstores import utils as chromautils
from langchain.embeddings import HuggingFaceEmbeddings

# Specify the directory of the folder containing all manuscripts in PDF
pdf_directory = '/Users/terekli/Desktop/naguib_pdf_set'

# Path to save the partitioned text and vector database (persist_directory)
output_directory = '/Users/terekli/Desktop/'
output_path_partitioned = os.path.join(output_directory, 'partitioned')
persist_directory = os.path.join(output_directory, 'chroma_persist')

# List all PDF files
files_and_dirs = os.listdir(pdf_directory)

 # Initialize the embeddings
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Iterate over each manuscript
for pdfname in files_and_dirs:

    # Construct full PDF path
    full_pdf_path = os.path.join(pdf_directory, pdfname)

    # Skip .DS_Store files (for Mac)
    if pdfname == '.DS_Store':
        continue

    # If PDF name contains contains '/', replace with ' ' (space)
    if '//' in pdfname:
        pdfname = pdfname.replace('\\', ' ')
        old_pdf_path = full_pdf_path
        new_pdf_path = os.path.join(full_pdf_path, pdfname)
        # Rename the file
        os.rename(full_pdf_path, new_pdf_path)
        print(f"File renamed from {old_pdf_path} to {new_pdf_path}.")

    # Process each file
    print(f"Processing file: {pdfname}")

    # Partition !
    elements = partition_pdf(full_pdf_path, chunking_strategy='by_title', combine_text_under_n_chars=500)
    print(f"    Partitioned")
    # Save partitioned file as text
    textname = pdfname.replace('.pdf', '.txt')
    output_path_partitioned_textname = os.path.join(output_path_partitioned, textname)
    with open(output_path_partitioned_textname, 'w') as file:
        for element in elements:
            file.write(element.text + '\n')
    print(f"    Partition saved")

    # Embedding !
    documents = []
    for element in elements:
        # Keep file name to allow traceback to the originial manuscript
        metadata = {'file_name': element.metadata.filename}
        # Combine parition text with manuscript name
        document = Document(page_content=element.text, metadata=metadata)
        # Append the partitioned document to the list of full documents
        documents.append(document)

    # Save embedded vector database
    vectordb = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)
    print(f"    Embedded")
    vectordb.persist()
    print(f"    Embedding saved")
    del documents
    del elements
    gc.collect()

# Import vector database and build RAG model based on LLM

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from google.colab import drive

# drive.mount('/content/drive')

# Initialize the embeddings
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Extract embedded vector database
persist_directory = '/content/drive/MyDrive/chroma_persist'
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# # query text
# query_text = "mxene synthesize"

# # results = vectordb.search(query=query_text, search_type='similarity')
# results = vectordb.similarity_search(query=query_text, k=10)
# print("Search Results:")
# for document in results:
#     print(f"Manuscript Name: {document.metadata['file_name']}\n")
#     print(f"  Partition: {document.page_content} \n")

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
from huggingface_hub.hf_api import HfFolder

# LLM: https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B
model_name = 'NousResearch/Hermes-2-Pro-Llama-3-8B'

HfFolder.save_token('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')

# Quantized version of the model can run on the free T4 in Colab
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    nb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Unused kwargs: ['nb_4bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/57.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 4})

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|im_end|>")
]

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=250,
    eos_token_id=terminators,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|im_start|>system
You are a helpful assistant.
You are given relevant documents for context and a question. Provide a conversational answer.
If you don't know the answer, just say "I do not know." Don't make up an answer.
Question: <|im_end|>
<|im_start|>user
{question}
Context: {context} <|im_end|>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [20]:
question = """
You are a research assistant for my lab.
You are give my lab's publications for the last 10 years.
Answer my question using these work.
Provide a conversational answer.
If you don't know the answer, just say 'I do not know'.
Don't make up an answer.
Question: Give me step-by-step guide to synthesize MXene, including concentration of chemicals needed.
"""

rag_chain.invoke(question)
# retriever.invoke(question)

'I do not know the specific step-by-step guide to synthesize MXene as the provided context does not contain enough information on the synthesis process. However, generally speaking, the synthesis of MXene involves the following steps:\n\n1. Select the appropriate precursor material, such as titanium dioxide (TiO2), which is commonly used for synthesizing MXene.\n2. Disperse the precursor material in a suitable solvent, typically water or another liquid medium.\n3. Add a solution containing a suitable intercalation agent, such as hydrofluoric acid (HF) or lithium fluoride (LiF), to the dispersion.\n4. Heat the mixture under controlled conditions, such as temperature and time, to facilitate the intercalation process.\n5. Remove the intercalation agent from the mixture through washing and filtration.\n6. Exfoliate the resulting material to obtain single-layer or multilayer MXene sheets.\n7. Finally, isolate and characterize the synthesized MXene using various analytical techniques, such a