In [1]:
# https://blog.gopenai.com/chunking-pdfs-and-multimodal-documents-efficient-methods-for-handling-text-tables-and-images-for-467472f02d34
# https://blog.gopenai.com/rag-loading-multimodal-documents-into-vector-databases-with-microsoft-phi-3-phi-3-vision-and-30b142e0d26e
# https://www.pragnakalp.com/leverage-phi-3-exploring-rag-based-qna-with-microsofts-phi-3/
!pip install git+https://github.com/huggingface/transformers
!pip install langchain chromadb pypdf openai sentence-transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to c:\users\acer alan\appdata\local\temp\pip-req-build-38xp49f3
  Resolved https://github.com/huggingface/transformers to commit 083e13b7c47f674b11c74d1b7c7ee7cd1241b406
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\acer alan\AppData\Local\Temp\pip-req-build-38xp49f3'




In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

# Local checkpoint directory shared by the tokenizer and the model. Keeping it in
# one constant prevents the two loads from drifting to different paths.
PHI3_MODEL_DIR = "C:\\Users\\acer alan\\Desktop\\Phi3Demo\\Phi-3-mini-128k"
# Upper bound on the number of tokens the pipeline generates per call.
MAX_NEW_TOKENS = 300

# Embedding model for the vector store. No model name is passed, so whatever
# HuggingFaceEmbeddings uses by default is loaded, placed on the CUDA device.
model_kwargs = {'device':'cuda'}
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_DIR)
# NOTE(review): trust_remote_code=True allows Python code shipped alongside the
# checkpoint to run; only keep it for checkpoints from a trusted source.
model = AutoModelForCausalLM.from_pretrained(
    PHI3_MODEL_DIR,
    device_map="cuda",
    torch_dtype="auto",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)

# Text-generation pipeline wrapped so LangChain chains can use it as an LLM.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
)
llm = HuggingFacePipeline(pipeline=pipe)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Source PDF to index (image extraction is disabled; only text is read).
pdf_link = "C:\\Users\\acer alan\\Desktop\\table_test\\2404.14219v2.pdf"
loader = PyPDFLoader(pdf_link,extract_images=False)
# Use load() instead of load_and_split(): the latter already chunks the text with
# the library's default splitter, so the splitter configured below would be
# re-splitting pre-cut pieces and its start_index would be measured from the
# start of a piece rather than from the start of the page.
pages = loader.load()

# Character-based splitter: chunks of up to 1000 characters with a 20-character
# overlap; add_start_index records each chunk's offset in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)

chunks = text_splitter.split_documents(pages)

# Fail here with a clear message instead of later inside the vector store when
# the PDF yielded no extractable text (e.g. a scanned, image-only document).
if not chunks:
    raise ValueError(f"No text chunks could be extracted from {pdf_link!r}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [4]:
# Give every chunk a stable id built from its source path and its position in
# `chunks`. When no ids are supplied the store generates fresh ones on every call,
# so re-running this cell against the existing "text_index" directory appends
# another copy of every chunk and the retriever starts returning duplicate
# passages. With stable ids a re-run overwrites the same entries instead.
# An index already filled by earlier runs without ids has to be deleted once by
# hand; stale entries are not removed here.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}#chunk-{position}"
    for position, doc in enumerate(chunks)
]
db = Chroma.from_documents(
    chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="text_index",
)
db.persist()

In [5]:
# Open the collection stored under "text_index" with the same embedding function
# and expose it as a retriever configured with k=3.
vector = Chroma(
    embedding_function=embeddings,
    persist_directory="text_index",
)
retriever = vector.as_retriever(search_kwargs=dict(k=3))

In [6]:
# Chat-style prompt assembled line by line: a <|system|> turn with the
# instructions, a <|user|> turn carrying the {context} and {question}
# placeholders, and an open <|assistant|> turn for the model to complete.
qna_prompt_template = "\n".join([
    "<|system|>",
    'You have been provided with the context and a question, try to find out the answer to the question only using the context information. If the answer to the question is not found within the context, return "I dont know" as the response.<|end|>',
    "<|user|>",
    "Context:",
    "{context}",
    "",
    "Question: {question}<|end|>",
    "<|assistant|>",
])
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=qna_prompt_template,
)

# Question-answering chain of type "stuff", driven by the local LLM and PROMPT
chain = load_qa_chain(llm, prompt=PROMPT, chain_type="stuff")

In [7]:
# A utility function for answer generation
def ask(question):
    """Answer ``question`` using documents fetched by the module-level retriever.

    The retrieved documents are printed, then passed together with the
    question to the module-level QA ``chain``.

    Args:
        question: The question text to retrieve documents for and to answer.

    Returns:
        The ``'output_text'`` entry of the chain's result.
    """
    retrieved_docs = retriever.get_relevant_documents(question)
    print(retrieved_docs)

    chain_inputs = {"input_documents": retrieved_docs, "question": question}
    result = chain(chain_inputs, return_only_outputs=True)
    return result['output_text']

In [9]:
# Prompt for a question, run it through ask(), and show the reply.
user_question = input("User: ")
answer = ask(user_question)
# Keep only the text after the last "<|assistant|>" tag (the whole string is
# kept when the tag is absent), then trim surrounding whitespace.
answer = answer.rpartition("<|assistant|>")[2].strip()
print("Answer:", answer)

[Document(page_content='Phi-3-mini\n3.8bPhi-3-small\n7b (preview)Phi-3-medium\n14b (preview)Phi-2\n2.7bMistral\n7bGemma\n7bLlama-3-In\n8bMixtral\n8x7bGPT-3.5\nversion 1106\nMMLU\n(5-Shot) [HBK+21]68.8 75.3 78.2 56.3 61.7 63.6 66.0 68.4 71.4\nHellaSwag\n(5-Shot) [ZHB+19]76.7 78.7 83.0 53.6 58.5 49.8 69.5 70.4 78.8\nANLI\n(7-Shot) [NWD+20]52.8 55.0 58.7 42.5 47.1 48.7 54.8 55.2 58.1\nGSM-8K\n(0-Shot; CoT) [CKB+21]82.5 88.9 90.3 61.1 46.4 59.8 77.4 64.7 78.1\nMedQA\n(2-Shot) [JPO+20]53.8 58.2 69.4 40.9 49.6 50.0 58.9 62.2 63.4\nAGIEval\n(0-Shot) [ZCG+23]37.5 45.0 48.4 29.8 35.1 42.1 42.0 45.2 48.4\nTriviaQA\n(5-Shot) [JCWZ17]64.0 59.1 75.6 45.2 72.3 75.2 73.6 82.2 85.8\nArc-C\n(10-Shot) [CCE+18]84.9 90.7 91.0 75.9 78.6 78.3 80.5 87.3 87.4\nArc-E\n(10-Shot) [CCE+18]94.6 97.1 97.8 88.5 90.6 91.4 92.3 95.6 96.3\nPIQA\n(5-Shot) [BZGC19]84.2 87.8 87.7 60.2 77.7 78.1 77.1 86.0 86.6\nSociQA\n(5-Shot) [BZGC19]76.6 79.0 80.2 68.3 74.6 65.5 73.2 75.9 68.3\nBigBench-Hard\n(0-Shot) [SRR+22, SSS+22]71