In [1]:
!pip install -r requirements.txt



In [2]:
!pip install langchain_community



In [4]:
import fitz
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document

from huggingface_hub import login
login()

model_id = 'mistralai/Mistral-7B-Instruct-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map = 'auto', load_in_4bit = True)
qa_pipeline = pipeline('text-generation', model = model, tokenizer = tokenizer, max_new_tokens = 500)

# For small responses.
# qa_pipeline = pipeline('text2text-generation', model = 'google/flan-t5-small', max_new_tokens = 500)

embedder = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
llm = HuggingFacePipeline(pipeline = qa_pipeline)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
  embedder = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
  llm = HuggingFacePipeline(pipeline = qa_pipeline)


In [5]:
def load_pdf(pdf_file):
    docs = fitz.open(list(pdf_file.keys())[0])
    text = '\n'.join([page.get_text() for page in docs])
    return text

def get_embeddings(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)
    chunks = splitter.split_text(text)
    docs = [Document(page_content = chunk) for chunk in chunks]
    return FAISS.from_documents(docs, embedder)

def ask_question(vs, query):
    docs = vs.similarity_search(query, k = 3)
    chain = load_qa_chain(llm, chain_type = 'map_reduce')
    return chain.run(input_documents = docs, question = query)

In [8]:
import streamlit as st

st.set_page_config(page_title = "PDF Q&A", layout = 'wide')
st.title("PDF Q&A")

pdf_file = st.file_uploader('Upload a PDF file', type = 'pdf')
if pdf_file:
    with st.spinner('Running PDF...'):
        text = load_pdf(pdf_file)
        vs = get_embeddings(text)
        st.success("PDF Uploaded...")

query = st.text_input("Ask your Question : ")

if query:
            with st.spinner('Thinking...'):
                answer = ask_question(vs, query)
                st.write('Answer : ', answer)

st.markdown("### 🔍 Retrieved context:")
#for doc in docs:
#    st.markdown(f"• {doc.page_content[:300]}...")




DeltaGenerator()

In [7]:
from google.colab import files
uploaded = files.upload()

if uploaded:
  text = load_pdf(uploaded)
  vs = get_embeddings(text)
  print('PDF Uploaded...')

query = input("Ask You Question : ")
if query:
  answer = ask_question(vs, query)
  print("Answer : ", answer)

Saving nlp.pdf to nlp (1).pdf
PDF Uploaded...
Ask You Question : what is natural language processing?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Answer :  Given the following extracted parts of a long document and a question, create a final answer. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.

Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other)  right or remedy.

11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation  in force of the remainder of the term (if any) and this Agreement.
