# References
- https://python.langchain.com/docs/use_cases/question_answering/
- https://gist.github.com/waleedkadous/aea1d312d68c9431949442cc562d5f2c

In [1]:
# Display the version of the Python interpreter running this notebook.
import sys
sys.version_info


sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)

In [None]:
# Install langchain and chromadb, each pinned to an exact version.
!pip3 install langchain==0.0.242
!pip3 install chromadb==0.4.2

In [9]:
# Constants
# Web article that the WebBaseLoader cells scrape.
SCRAPE_URL = "https://medium.com/@symmetrics_hr/the-m%CC%B6o%CC%B6n%CC%B6k%CC%B6-immigrant-who-s%CC%B6o%CC%B6l%CC%B6d%CC%B6-bought-his-ferrari-e7be20c4d891"
# Question asked against the scraped article in the search / QA cells.
DEFAULT_QUESTION = "What is H1b?"
# Path of the PDF opened with PdfReader in the lengthy-document section.
SAMPLE_PDF_DOCUMENT = "Tech_Hubs_NOFO.pdf"


In [2]:
from langchain.embeddings.base import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer

class LocalHuggingFaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by a Sentence Transformer.

    The model is loaded once in the constructor and reused for every call.

    Args:
        model_id: Model name or path handed to ``SentenceTransformer``.
    """

    def __init__(self, model_id):
        # Device selection is left to SentenceTransformer's defaults
        # (presumably the GPU when one is available -- TODO confirm).
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using a locally running
           Hugging Face Sentence Transformer model
        Args:
            texts: The list of texts to embed.
        Returns:
            List of embeddings, one for each text.
        """
        # encode() does not return plain Python lists; convert each vector so
        # the result matches the declared List[List[float]] return type, the
        # same way embed_query does for a single vector.
        return [list(map(float, vector)) for vector in self.model.encode(texts)]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a locally running HF
        Sentence Transformer.
        Args:
            text: The text to embed.
        Returns:
            Embeddings for the text.
        """
        embedding = self.model.encode(text)
        return list(map(float, embedding))

### HuggingFace
- Needed when using the transformers library to download models or to interact with the Hugging Face Hub.
- At a shell prompt, run `huggingface-cli login` and complete the login process.
- An access token can be obtained by following the [security tokens guide](https://huggingface.co/docs/hub/security-tokens).

# Implicit vectorstore
- Default is Chroma

In [None]:
# Document loader
from langchain.document_loaders import WebBaseLoader
from langchain.indexes import VectorstoreIndexCreator

# Load the article at SCRAPE_URL; `loader` and `data` are reused by later cells.
loader = WebBaseLoader(SCRAPE_URL)
data = loader.load()
# Last expression of the cell: displays the loaded data.
data

# Use GPT4All
- OpenAI can be used as well, however we wanted everything local

In [51]:
# Alternative embedding backends, kept for reference but not used here:
#from langchain.embeddings import OpenAIEmbeddings
#from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
# from langchain.embeddings import GPT4AllEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All

# Build the index from the web loader created in the previous cell.
index_creator = VectorstoreIndexCreator(embedding=HuggingFaceEmbeddings())
index = index_creator.from_loaders([loader])

# Question-answering with a model file stored on the local disk.
question = DEFAULT_QUESTION
llm = GPT4All(
    model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",
    max_tokens=2048,
)
# Last expression of the cell: displays the answer.
index.query(question, llm=llm)


Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


' H1B refers to a nonimmigrant visa classification used by U.S. employers to hire foreign workers in specialty occupations, such as in the fields of science, engineering, and technology.'

## Explicit Chroma Store with QA Chain

In [None]:
# Document loader
from langchain.document_loaders import WebBaseLoader
# Load the article at SCRAPE_URL; `data` is split in the next cell.
loader = WebBaseLoader(SCRAPE_URL)
data = loader.load()
# Last expression of the cell: displays the loaded data.
data

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 500-character chunks with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
# Split the loaded data, then display the resulting chunks.
all_splits = text_splitter.split_documents(data)
all_splits

In [15]:
# Store
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

import chromadb

# Chroma client that is handed to the LangChain wrapper below.
chroma_client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "my_collection" already exists.
collection = chroma_client.get_or_create_collection(name="my_collection")

# Embed the chunks in `all_splits` and store them in "my_collection".
# NOTE(review): persist_directory is passed together with an explicit client;
# confirm whether it has any effect in that combination.
chroma_vector_store = Chroma.from_documents(
    documents=all_splits,
    client=chroma_client,
    collection_name="my_collection",
    embedding=HuggingFaceEmbeddings(),
    persist_directory=".",
)

In [19]:
# Question-answering
question = DEFAULT_QUESTION
# Ask the Chroma store for 4 results (k=4) matching the question.
results = chroma_vector_store.similarity_search(query=question,k=4)
# Last expression of the cell: displays the text of the first result.
results[0].page_content

'automobile aficionados, owning the brand of car that you dreamed since childhood can be the biggest achievement of life (until then).This article is about Venkat^[1], a Google Engineer who wished to buy his Ferrari before he reaches the age of 40H1b VisaIf you work in US tech sector, you donâ€™t need introduction to H1b. It is one of the most sought after visa (among many others) that is given to a worker with high skills (as defined by US government). Basically the united states government'

### MMR
- Maximal marginal relevance (MMR) search optimizes for similarity to the query AND diversity among the selected documents.

In [20]:
# Question-answering
question = DEFAULT_QUESTION
# Same question as the plain similarity search, but using the MMR search method.
results = chroma_vector_store.max_marginal_relevance_search(query=question,k=4)
# Last expression of the cell: displays the text of the first result.
results[0].page_content

'automobile aficionados, owning the brand of car that you dreamed since childhood can be the biggest achievement of life (until then).This article is about Venkat^[1], a Google Engineer who wished to buy his Ferrari before he reaches the age of 40H1b VisaIf you work in US tech sector, you donâ€™t need introduction to H1b. It is one of the most sought after visa (among many others) that is given to a worker with high skills (as defined by US government). Basically the united states government'

## QA Chain

In [62]:
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All

# Load the model file from the local disk through GPT4All.
llm = GPT4All(
    model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",
    max_tokens=2048,
)
# QA chain of type "stuff" driven by that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Answer `question` from the documents in `results` (set by the search cell).
response = chain.run(
    input_documents=results,
    question=question,
)

Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


In [63]:
response

" H1b is a type of visa given to skilled workers from other countries who work in the US tech sector. It allows them to live and work in the country for up to six years, with the option to renew their visas multiple times. The program was established in 2000 as part of the government's efforts to attract and retain highly skilled workers."

# Use FAISS with QA Chain

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Document loader
loader = WebBaseLoader(SCRAPE_URL)
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
# The FAISS store is built with from_texts, which takes plain strings, so the
# page text of the first loaded document is split instead of Document objects.
all_splits = text_splitter.split_text(text=data[0].page_content)
# Last expression of the cell: displays the chunks.
all_splits

In [7]:
# FAISS stores in RAM
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings()
# Use the LocalHuggingFaceEmbeddings class defined earlier in this notebook
# in place of the commented-out HuggingFaceEmbeddings alternative.
embeddings = LocalHuggingFaceEmbeddings('multi-qa-mpnet-base-dot-v1')
# Embed the text chunks in `all_splits` and index them with FAISS.
faiss_vector_store = FAISS.from_texts(all_splits, embedding=embeddings)

## Embeddings created locally
- When HuggingFaceEmbeddings() is called, a network call happens
- We can reduce this by creating the embeddings locally, hence LocalHuggingFaceEmbeddings

In [12]:
# Question-answering
question = DEFAULT_QUESTION
# Ask the FAISS store for 4 results (k=4) matching the question.
results = faiss_vector_store.similarity_search(query=question,k=4)
# Last expression of the cell: displays the text of the first result.
results[0].page_content

'automobile aficionados, owning the brand of car that you dreamed since childhood can be the biggest achievement of life (until then).This article is about Venkat^[1], a Google Engineer who wished to buy his Ferrari before he reaches the age of 40H1b VisaIf you work in US tech sector, you donâ€™t need introduction to H1b. It is one of the most sought after visa (among many others) that is given to a worker with high skills (as defined by US government). Basically the united states government'

In [67]:
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All

# Load the model file from the local disk through GPT4All.
llm = GPT4All(
    model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",
    max_tokens=2048,
)
# QA chain of type "stuff" driven by that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Answer `question` from the documents in `results` (set by the FAISS search cell).
response = chain.run(
    input_documents=results,
    question=question,
)

Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


In [68]:
response

" H1b is a type of visa given to skilled workers from other countries who work in the US tech sector. It allows them to live and work in the country for up to six years, with the option to renew their visas multiple times. The program was established in 2000 as part of the government's efforts to attract and retain highly skilled workers."

## Ask questions over lengthy documents
- US Government funding program for AI etc.
- https://www.eda.gov/sites/default/files/2023-05/Tech_Hubs_NOFO.pdf

In [69]:
# PyPDF2 provides the PdfReader used below to read the sample PDF.
!pip3 install PyPDF2==3.0.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
import os
# Set TOKENIZERS_PARALLELISM explicitly, as the huggingface/tokenizers fork
# warning printed by the previous cell suggests.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

pdf_reader = PdfReader(SAMPLE_PDF_DOCUMENT)
# Concatenate the extracted text of every page, in page order. join builds
# the string in one pass instead of re-copying it on every `+=`.
text = "".join(page.extract_text() for page in pdf_reader.pages)

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)
# The FAISS store is built with from_texts, which takes plain strings, so the
# raw text is split rather than Document objects.
all_splits = text_splitter.split_text(text=text)

In [75]:
# FAISS stores in RAM
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Create the embedding model once and reuse it for the store, rather than
# constructing a second HuggingFaceEmbeddings instance.
embeddings = HuggingFaceEmbeddings()
# Embed the PDF text chunks in `all_splits` and index them with FAISS.
faiss_vector_store = FAISS.from_texts(all_splits, embedding=embeddings)

In [76]:
# Question-answering
question = "What is tech hubs program?"
# Ask the FAISS store built from the PDF chunks for 4 results (k=4).
results = faiss_vector_store.similarity_search(query=question,k=4)
# Last expression of the cell: displays the text of the first result.
results[0].page_content

'2 The National Science Foundation is required to review and update this list annually. For the purposes of both \nPhases 1 and 2 of the Tech Hubs Competition, EDA is relying on the initial list enacte d by Congress at 42 U.S.C. § \n19107(c).   \nPage 6 of 37 significant technological strength from the above list as opposed to more nascent or less resourced \ntechnology area s. \n \nIllustrations of potential relationships among Key Technology Focus Areas (KFTAs) and a coalition’s \nselected core technology areas  \nNote that the Tech Hubs program is not intended to fund basic and fundamental research nor activities \nintended to increase capacity to conduct such research ; the National Science Foundation and other \nagencies fund such activities . Instead , the Tech Hubs program is intended to advance the capacities of \nplaces to commercializ e, deploy,  and domestic ally manufactur e and d eliver these technologies. All \nprojects funded under both phases of the Tech Hubs Program sh

In [77]:
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All

# Load the model file from the local disk through GPT4All.
llm = GPT4All(
    model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",
    max_tokens=2048,
)
# QA chain of type "stuff" driven by that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Answer `question` from the documents in `results`, then display the answer.
response = chain.run(
    input_documents=results,
    question=question,
)
response

Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


' The Tech Hubs Program is a U.S. government competition that aims to identify regions where an investment can catalyze self-sustaining, globally-competitive regions over the next decade, with each focused on a key technology focus area. To achieve this goal and fulfill statutory direction, EDA will run this competition with a focus on geographic diversity and equity in two phases through two separate Notices of Funding Opportunity (NOFO). In this Phase 1 NOFO, EDA will fund Strategy Development Grants and will Designate certain regions as Regional Technology and Innovation Hubs (Tech Hubs) . Applicants to this Phase 1 NOFO must choose whether they are pursuing a Strategy Development Grant, a Tech Hub, or both.'

# LLMChain

In [78]:
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All

# Load the model file from the local disk through GPT4All.
llm = GPT4All(model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",max_tokens=2048)

# Prompt
# The template has a single input variable, {docs}.
prompt = PromptTemplate.from_template(
    "Summarize the main themes in these retrieved docs: {docs}"
)

# Chain
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Run
question = "What is tech hubs program?"
docs = faiss_vector_store.similarity_search(question)
# NOTE(review): `docs` is a list of Document objects, so {docs} appears to be
# filled with the list's string form rather than only the page text -- confirm
# this is intended. A later cell reads result["docs"][0].page_content, so the
# input type cannot be changed here without updating that cell as well.
result = llm_chain(docs)

# Output
result["text"]

Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


" Document(page_content='Designation Grant, or both. For Strategy Development Grants, applicants must propose a plan for developing a strategy for their Tech Hub that addresses the key technology focus area of their region and demonstrates how the proposed activities will catalyze self-sustaining economic development in the region over the next decade. For Designation Grants, applicants must identify regions that meet the criteria for geographic diversity and equity in order to be designated as a Tech Hub . 16 See https://www.commerce.gov/issues/workforce -development .', metadata={})] Document(page_content='these regions. The Designated Tech Hubs will receive additional funding and support to implement their strategies over the next decade, including through Implementation Grants awarded in Phase 2 of the program. 17 See https://www.commerce.gov/funding -opportunities/regional-technology-innovation-hubs .', metadata={})] Document(page_content='The Tech Hubs Program is designed to meet

In [82]:
result["docs"][0].page_content

'2 The National Science Foundation is required to review and update this list annually. For the purposes of both \nPhases 1 and 2 of the Tech Hubs Competition, EDA is relying on the initial list enacte d by Congress at 42 U.S.C. § \n19107(c).   \nPage 6 of 37 significant technological strength from the above list as opposed to more nascent or less resourced \ntechnology area s. \n \nIllustrations of potential relationships among Key Technology Focus Areas (KFTAs) and a coalition’s \nselected core technology areas  \nNote that the Tech Hubs program is not intended to fund basic and fundamental research nor activities \nintended to increase capacity to conduct such research ; the National Science Foundation and other \nagencies fund such activities . Instead , the Tech Hubs program is intended to advance the capacities of \nplaces to commercializ e, deploy,  and domestic ally manufactur e and d eliver these technologies. All \nprojects funded under both phases of the Tech Hubs Program sh

# Retrieval QA
- The most abstracted one

In [87]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain import PromptTemplate, LLMChain

# Load the model file from the local disk through GPT4All.
llm = GPT4All(model="/home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin",max_tokens=2048)

# Prompt
# {context} and {question} are the two input variables of the QA prompt.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Retrieval chain that pulls its documents from the FAISS store's retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_vector_store.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# RetrievalQA fetches the context itself through its retriever and only needs
# the "query" input, so no manual similarity search or "context" key is passed.
# `question` is the value set by an earlier cell.
response = qa_chain({"query": question})

Found model file at  /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin


llama.cpp: loading model from /home/ubuntu/Downloads/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 3200
llama_model_load_internal: n_mult     = 240
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 26
llama_model_load_internal: n_rot      = 100
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 8640
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size =    0.06 MB
llama_model_load_internal: mem required  = 2862.72 MB (+  682.00 MB per state)
llama_new_context_with_model: kv self size  =  650.00 MB


In [89]:
response["result"]

' The Tech Hubs Program is a U.S. Economic Development Administration competition that aims to support regions in becoming globally competitive by investing in modernization of manufacturing and innovation in key technology focus areas. It will fund two phases of regional development, with Phase 1 focused on strategy development grants and designating regions as tech hubs (Tech Hubs), and Phase 2 focused on implementation grants for the designated regions.'