In [1]:
pip install transformers langchain datasets faiss-cpu

Collecting langchain
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting langchain-core<0.3.0,>=0.2.35 (from langchain)
  Downloading langchain_core-0.2.36-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.107-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  D

In [2]:
from datasets import load_dataset

# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")

# We will focus on the training set for this example
train_data = dataset['train']


Downloading readme:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/594k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/264k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2733 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20360 [00:00<?, ? examples/s]

In [3]:
# Extract questions and corresponding passages
documents = []
for example in train_data:
    # Check if 'passage_text' key exists before accessing it
    passage_text = example.get("passage_text", "")
    documents.append({
        "text": example["document_title"] + ": " + passage_text,
        "question": example["question"],
        "answer": example["label"]
    })

# Split the texts and prepare the document corpus for retrieval
passages = [doc["text"] for doc in documents]

In [4]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load a pre-trained sentence transformer model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for all passages
passage_embeddings = embedding_model.encode(passages, convert_to_tensor=False)

# Convert embeddings to a numpy array
passage_embeddings = np.array(passage_embeddings)

# Create a FAISS index
index = faiss.IndexFlatL2(passage_embeddings.shape[1])
index.add(passage_embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.14-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [7]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [8]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from transformers import pipeline

# Embedder for document retrieval
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Assuming 'passages' is a list of text
# Create documents from passages
docs = [Document(page_content=passage) for passage in passages]

embeddings = embedder.embed_documents(passages)

# Convert the list of embeddings into a NumPy array
embeddings_np = np.array(embeddings)

# Create a FAISS index using the shape of the NumPy array
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)

# Create InMemoryDocstore from documents
# Use the same IDs as in FAISS index
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs)})

# Create a FAISS-backed vector store
vectorstore = FAISS(embedding_function=embedder.embed_query, # Use embed_query for querying
                     index=index,
                     docstore=docstore,
                     index_to_docstore_id={i: str(i) for i in range(len(docs))}  # Provide the mapping
                    )

# Setup the retriever
retriever = vectorstore.as_retriever()

# Load a Hugging Face model using transformers' pipeline
# Increase max_length to handle longer inputs
hf_pipeline = pipeline("text-generation", model="EleutherAI/gpt-neo-125m", temperature=0.7, max_length=100)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Define a prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="Context: {context}\n\nQuestion: {question}\n\nAnswer:"
)

# Setup the Retrieval-Augmented Generation chain using the updated API
qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt_template)

# Integrate the retriever with the QA chain
rag = RetrievalQA(combine_documents_chain=qa_chain, retriever=retriever)

# Use the RAG model
result = rag({"query": "Your question here"})
print(result)

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=hf_pipeline)
stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt_template)
  rag = RetrievalQA(combine_documents_chain=qa_chain, retriever=retriever)
  result = rag({"query": "Your question here"})
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with 

{'query': 'Your question here', 'result': 'Context: What You Need: \n\nWhat You Need: \n\nWhat You Need: \n\nThe Trial: \n\nQuestion: Your question here\n\nAnswer: \n\nWhat You Need: \n\nWhat You Need: \n\nWhat You Need: \n\nThe Trial: \n\nQuestion: Your question here\n\nAnswer: \n\nWhat You Need: \n\nWhat You Need: \n\nThe Trial: \n\n'}


In [9]:
# Example query
query = "What is RAG?"

# Run the RAG model
result = rag({"query": query})  # Pass the query as a dictionary with the 'query' key
print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'query': 'What is RAG?', 'result': 'Context: Antibody: \n\nAntibody: \n\nAntibody: \n\nAntibody: \n\nQuestion: What is RAG?\n\nAnswer: RAG is a protein that is a part of the RAG family of proteins. RAG is a protein that is a part of the RAG family of proteins. RAG is a protein that is a part of the RAG family of proteins.\n\nRAG is a protein that'}


In [10]:
# Example query
query = "What is Machine Learning?"

# Run the RAG model
result = rag({"query": query})  # Pass the query as a dictionary with the 'query' key
print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'query': 'What is Machine Learning?', 'result': 'Context: Biological classification: \n\nBiological classification: \n\nBiological classification: \n\nBiological classification: \n\nQuestion: What is Machine Learning?\n\nAnswer: Machine Learning is a method of classification that is based on the classification of a set of data.\n\nA:\n\nThe problem is that you are trying to classify a set of data, not a single data set. \nThe problem is that you are trying to classify a set of data'}
