### Installing libraries

In [1]:
!pip install langchain
!pip install sentence_transformers
!pip install einops
!pip install unstructured
!pip install python-magic-bin
!pip install chromadb
!pip install accelerate

Collecting langchain
  Downloading langchain-0.0.231-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.9-py3-none-any.whl (26 kB)
Collecting langchainplus-sdk<0.0.21,>=0.0.20 (from langchain)
  Downloading langchainplus_sdk-0.0.20-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.3.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)
  Downloading marshmallow-3.19.0-py3-none-any.whl (49 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.1/49.1 kB 3.0 MB/s eta 0:00:00
Collecting marshmallow-enum<2.0.0

### Importing libraries

In [2]:
from langchain.vectorstores import FAISS, Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader, DataFrameLoader

import os
import nltk
import re
import pandas as pd


#### Loading pan_card_services.txt knowledge document

In [6]:
# Get your loader ready.
# The search starts at the filesystem root, but the glob is restricted to the PAN-card
# knowledge document so that documents[0] is that document and not an arbitrary .txt file.
# The trailing wildcard accepts both spellings of the file name used in this notebook
# (pan_card_service.txt / pan_card_services.txt).
loader = DirectoryLoader('/', glob='**/pan_card_service*.txt')

documents = loader.load()

# Fail here with a clear message instead of an IndexError on documents[0] further down.
if not documents:
    raise FileNotFoundError("No pan_card_service*.txt knowledge document found under '/'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#### Preprocessing and Creating Chunks

In [20]:
def split_topic(topic, chunk_size=500, whole_limit=800):
  """Split one cleaned topic string into a list of chunks.

  Args:
    topic: Topic text after whitespace and '*' clean-up.
    chunk_size: Length of each of the first two slices of a long topic.
    whole_limit: Topics up to this many characters are kept as a single chunk.

  Returns:
    An empty list for a blank topic, ``[topic]`` for a topic of at most
    ``whole_limit`` characters, otherwise two ``chunk_size`` slices followed by
    the remaining text (the remainder is not size-limited and is omitted when empty).
  """
  if not topic.strip():
    # Blank segments (e.g. text before the first "###") carry nothing worth embedding.
    return []
  if len(topic) <= whole_limit:
    return [topic]
  pieces = [
    topic[:chunk_size],
    topic[chunk_size:2 * chunk_size],
    topic[2 * chunk_size:],
  ]
  # The remainder slice is empty when the topic is at most 2 * chunk_size long.
  return [piece for piece in pieces if piece]


final_chunks = []
# Each "###" heading in the knowledge document starts a new topic.
topic_list = documents[0].page_content.split("###")
for topic in topic_list:
  topic = re.sub(' +', ' ', topic)  # collapse runs of spaces
  topic = topic.replace('\n', '')   # drop line breaks
  topic = topic.replace('*', '')    # drop markdown emphasis markers
  # Every topic length is handled, so no topic is silently dropped.
  final_chunks.extend(split_topic(topic))

In [30]:
# One row per chunk; every row carries the same source label in its 'metadata' column.
df = pd.DataFrame(final_chunks, columns=['text']).assign(
    metadata='KnowledgeDocument(pan_card_service.txt)'
)
# Turn the rows into langchain documents.
docs = DataFrameLoader(df).load()

In [31]:
# Splitter configuration: pieces of up to 224 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=224,
)

In [32]:
# Run the splitter over the loaded documents to obtain the smaller text pieces.
texts = text_splitter.split_documents(documents=docs)

In [33]:
# (number of split pieces, number of documents before splitting)
(len(texts), len(docs))

(98, 28)

### Loading a HuggingFace sentence transformer to create embeddings

In [14]:
from langchain.embeddings import (
    HuggingFaceEmbeddings,
    SentenceTransformerEmbeddings,
)

# Embedding model for the chunks; the recorded output below shows its files being downloaded.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

#### Creating a Chroma index to store the texts and their corresponding embeddings in memory

In [34]:
# Build the vector index from `docs` (the chunks before character splitting).
# The finer-grained `texts` could be passed here instead.
docsearch = Chroma.from_documents(documents=docs, embedding=embeddings)

### Authenticating with HuggingFace using an API token to use the LLM

In [15]:


from getpass import getpass

# The API token must never be stored in the notebook itself. It is taken from the
# HUGGINGFACEHUB_API_TOKEN environment variable when that is set; otherwise it is
# prompted for, and getpass does not echo the typed value.
# NOTE(review): any token that was previously saved in this cell should be revoked
# and replaced with a new one.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
    "HuggingFace Hub API token: "
)

··········


#### Loading the Falcon-7B-Instruct model from Hugging Face

In [35]:
import os

from langchain import HuggingFaceHub
from langchain import PromptTemplate, LLMChain

# Expose the token through the environment before the hub client is constructed.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

# Model served through the HuggingFace Hub. Other repositories that were tried:
#   michaelfeil/ct2fast-falcon-7b-instruct, tiiuae/falcon-7b, mosaicml/mpt-7b-instruct,
#   deepset/roberta-base-squad2, google/tapas-base-finetuned-wtq,
#   nvidia/megatron-bert-cased-345m
repo_id = 'tiiuae/falcon-7b-instruct'

generation_kwargs = {"temperature": 0.2, "max_length": 10000}
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=generation_kwargs)

# Question-answering chain: the "stuff" chain type with the Chroma index as retriever.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

Enter your query

In [36]:
# Question used for retrieval and passed to the QA chain in the next cell.
# Exactly one assignment should be active; the commented-out lines are alternative
# sample questions.
# query = "can i apply for pan card without adhar?"
query = "what is the fees of pan card?"
# query = "what are the documents required for the new pan card"
# query = "Can I apply for a PAN card if I am a non-resident Indian (NRI)?"

Run this cell to get the answers with prompt engineering included

In [39]:

# Fetch the chunks most relevant to the question and join their text into one context string.
retriever = docsearch.as_retriever()
ret_list = retriever.get_relevant_documents(query)
ret_str = ' '.join(doc.page_content for doc in ret_list)

# Prompt = retrieved context followed by the question.
prompt_query = f'given the context: \n{ret_str}\nAnswer the following question: {query}'

print(prompt_query, '\n')
print('Answer of the LLM:')
# NOTE(review): `qa` was built with its own retriever, so it presumably retrieves again
# using this whole prompt as the search text - confirm that this is intended.
qa.run(prompt_query)

given the context: 
 Charges for reprinting the PAN CardThe charges for reprinting the PAN Card are INR 2500 for e-pan, and INR 3700 for physical pan card.---## Linking PAN with Aadhaar cardABC can link your PAN card and Aadhaar card on your behalf.  Cost of new PAN cardThe PAN CARD Application through ABC costs Rs 2500 for E-PAN, and if you want it to be couriered, it will cost Rs 1200 extra for physical delivery to your address.e-PAN Card cost: INR 2500Physical PAN Card cost: INR 3700 # About Pan Card  What is Pan card?The PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.
Answer the following question: what is the fees of pan card? 

Answer of the LLM:


'\nThe fees for a new PAN card application are Rs. 2500 for e-PAN'