# Import the libraries, which we are going to use in this implementation.

In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Document Loading*  Using Hugging Face, load the data.


In [2]:
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 2 entries
data[:2]

[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""')]

# Document Transformers* There are several “Text Splitters” in LangChain, you have to choose according to your choice.
* I chose “RecursiveCharacterTextSplitter”. This text splitter is recommended for generic text.
* It is parametrized by a list of characters.
* It tries to split the long texts recursively until the chunks are smaller enough.


In [3]:
# Configure the splitter: chunks of 1000 characters, with a
# 150-character overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

# Run the splitter over the loaded documents in 'data' and keep the resulting chunks.
docs = text_splitter.split_documents(data)

In [4]:
# Inspect the first chunk produced by the splitter
docs[0]

Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."')

# Text Embedding* Embeddings capture the semantic meaning of the text which allows you to quickly and efficiently find other pieces of text which are similar.4
* The Embeddings class of LangChain is designed for interfacing with text embedding models.
* You can use any of them, but I have used here “HuggingFaceEmbeddings”.


In [5]:
# Pre-trained sentence-embedding model to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model configuration: run the computations on the CPU
model_kwargs = dict(device="cpu")

# Encoding configuration: 'normalize_embeddings' is switched off
encode_kwargs = dict(normalize_embeddings=False)

# Build the HuggingFaceEmbeddings instance from the three settings above
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


  embeddings = HuggingFaceEmbeddings(


In [6]:
# Smoke-test the embedding model: embed one sample sentence
text = "This is a test document."
query_result = embeddings.embed_query(text)
# Show only the first three components of the embedding
query_result[:3]

[-0.03833850845694542, 0.1234646737575531, -0.028642946854233742]

# Vector Stores
* We need a database in which to store those embeddings and search them efficiently.
* Therefore, for storage and search purposes, we need vector stores.

In [None]:
# Build a FAISS vector store from the document chunks, using 'embeddings' as the embedding model
db = FAISS.from_documents(docs, embeddings)

Now, search the question.

In [None]:
# Query the vector store directly with a natural-language question
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
# Print the text of the first returned document
print(searchDocs[0].page_content)

# Preparing the LLM Model
* We can choose any model from Hugging Face, and start with a tokenizer to preprocess text and a question-answering model to provide answers based on input text and questions.


* I used Intel/dynamic_tinybert which is a fine-tuned model for the purpose of question-answering.

In [None]:
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
# NOTE(review): 'model' is not passed to the pipeline created later in this file,
# which is given the checkpoint name instead — confirm whether this load is still needed.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

* Create a question-answering pipeline using your pre-trained model and tokenizer and then extend its functionality by creating a LangChain pipeline with additional model-specific arguments.

In [None]:
# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model. Only the length cap
# is a load-time setting; padding and truncation are chosen when text is
# encoded, so they are not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

# Define a question-answering pipeline using the model and tokenizer.
# 'return_tensors' is not a question-answering pipeline option, so it is omitted.
question_answerer = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=tokenizer,
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length).
# NOTE(review): LangChain documents HuggingFacePipeline for text-generation,
# text2text-generation, summarization and translation pipelines; a
# question-answering pipeline is likely rejected when the LLM is invoked.
# Confirm against the installed version, and switch to a generative model
# if the chain built on 'llm' must actually be run.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

# Retrievers
Once the data is in the database, the LLM model is prepared, and the pipeline is created, we need to retrieve the data. A retriever is an interface that returns documents for a query.


A retriever does not store documents; it only returns (retrieves) them. Vector stores are the backbone of retrievers. There are many retriever algorithms in LangChain.

In [None]:
# Create a retriever object from the 'db' using the 'as_retriever' method.
# The retriever is queried below to fetch the documents relevant to a question.
retriever = db.as_retriever()

Searching relevant documents for the question:

In [None]:
# Keep the hits under their own name: rebinding 'docs' here would overwrite the
# chunked corpus the vector store was built from, so re-running the indexing
# cell afterwards would index only these few search results.
# NOTE(review): newer LangChain releases may prefer 'retriever.invoke(...)' over
# 'get_relevant_documents' — confirm against the installed version.
relevant_docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(relevant_docs[0].page_content)

# Retrieval QA Chain

* Now, we’re going to use a RetrievalQA chain to find the answer to a question.
* To do this, we prepared our LLM model with “temperature = 0.7” and “max_length = 512”.
* You can set the temperature to whatever value you like.

In [None]:
# Rebuild the retriever with an explicit search setting: return up to 4 relevant chunks per query.
retriever = db.as_retriever(
    search_kwargs={"k": 4},
)

# Assemble the RetrievalQA chain from the language model ('llm'), the "refine"
# chain type and the retriever above; source documents are not returned.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=retriever,
    return_source_documents=False,
)

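Finally, we ask a question. Note that the cell below does not call the RetrievalQA chain built above; it runs a standalone Transformers question-answering pipeline on a hand-written context.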



In [None]:
from transformers import pipeline

# Stand-alone question-answering pipeline; no model is named, so the library's default is used
qa_pipeline = pipeline("question-answering")

# Passage the answer is extracted from
context = (
    "Thomas Jefferson (April 13, 1743 – July 4, 1826) was an American statesman, diplomat, lawyer, "
    "architect, philosopher, and Founding Father who served as the third president of the United States "
    "from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring "
    "the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American "
    "Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first "
    "United States secretary of state under George Washington and then the nation's second vice president under John Adams."
)

# Question to ask about the passage
question = "Who is Thomas Jefferson?"

# Run the pipeline with explicit keyword arguments, then print only the answer text
result = qa_pipeline(question=question, context=context)
print(result["answer"])
