In [None]:
! pip install transformers faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 85.5/85.5 MB 9.1 MB/s eta 0:00:00
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import os
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
def setup_retriever(corpus, model_name):
    """Embed every document in ``corpus`` and store the vectors in a FAISS index.

    Each document is tokenized and passed through the encoder on its own; the
    document vector is the mean of the last hidden state over the token axis.
    Vectors are added, without normalisation, to an ``IndexFlatIP`` wrapped in
    an ``IndexIDMap``. The id stored for a vector is the document's position
    in ``corpus``, so a search result can be mapped back with ``corpus[id]``.

    Args:
        corpus: Iterable of documents to index.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.

    Returns:
        tuple: ``(index, tokenizer, model)``.

    Raises:
        ValueError: If no document produced a usable (1-D) embedding.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    corpus_embeddings = []
    corpus_ids = []
    for doc_id, doc in enumerate(corpus):
        inputs = tokenizer(doc, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            # outputs[0] is the last hidden state; average it over the tokens.
            embedding = outputs[0].mean(dim=1).squeeze().numpy()

        # Only 1-D vectors can become rows of the index matrix. Anything else
        # is skipped, but the id of every kept vector stays equal to its
        # corpus position so that a skipped entry cannot shift later ids.
        if embedding.ndim == 1:
            corpus_embeddings.append(embedding)
            corpus_ids.append(doc_id)

    if not corpus_embeddings:
        raise ValueError("No embeddings were generated. Check the input corpus.")

    # FAISS works on contiguous float32 matrices and int64 ids.
    embedding_matrix = np.ascontiguousarray(corpus_embeddings, dtype=np.float32)
    id_array = np.asarray(corpus_ids, dtype=np.int64)

    index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_matrix.shape[1]))
    index.add_with_ids(embedding_matrix, id_array)

    return index, tokenizer, model



def retrieve_documents(query, index, tokenizer, model, k=5, corpus=None):
    """Return the documents whose index entries best match ``query``.

    The query is embedded by averaging the model's last hidden state over the
    token axis, and the resulting vector is searched in ``index``. The ids
    returned by the search are used as positions into ``corpus``.

    Args:
        query: Query text to embed.
        index: Object exposing ``search(vectors, k)`` returning ``(scores, ids)``.
        tokenizer: Callable turning ``query`` into keyword inputs for ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.
        k: Maximum number of documents to return.
        corpus: Sequence the ids refer to. When omitted, the notebook-level
            ``corpus`` variable is used, which is what earlier callers rely on.

    Returns:
        list: At most ``k`` documents, in the order returned by the index.

    Raises:
        NameError: If ``corpus`` is omitted and no notebook-level ``corpus``
            has been defined.
    """
    if corpus is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            corpus = globals()["corpus"]
        except KeyError:
            raise NameError("name 'corpus' is not defined") from None

    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        query_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # FAISS searches a 2-D float32 matrix with one row per query.
    query_matrix = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    _scores, doc_ids = index.search(query_matrix, k)

    # When fewer than k vectors are indexed, FAISS pads the ids with -1.
    # corpus[-1] would silently return the last document, so drop those slots.
    return [corpus[doc_id] for doc_id in doc_ids[0] if doc_id >= 0]





In [None]:
# Set up the generative language model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def setup_generator(model_name):
    """Load the tokenizer and sequence-to-sequence model for ``model_name``.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``. A falsy
            value falls back to ``"google-t5/t5-base"``, the checkpoint this
            function previously always loaded.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    # Honour the requested checkpoint instead of ignoring the argument.
    checkpoint = model_name or "google-t5/t5-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return tokenizer, model

def generate_response(query, documents, tokenizer, model, max_input_length=512):
    """Generate an answer to ``query`` from the retrieved ``documents``.

    The documents are joined with single spaces and combined with the query
    into one prompt of the form ``"question: <query> context: <documents>"``.
    That prompt is tokenized and decoded with beam search (4 beams, at most
    512 output tokens).

    Args:
        query: The user question.
        documents: A single string, or an iterable of strings, to use as context.
        tokenizer: Tokenizer callable that also provides ``batch_decode``.
        model: Model exposing ``generate``.
        max_input_length: Token limit applied when truncating the prompt.

    Returns:
        str: The first decoded sequence, or ``"No relevant documents found."``
        when ``documents`` is empty.
    """
    if not documents:
        return "No relevant documents found."

    # A plain string is already the context; joining it again would insert a
    # space between every character.
    context = documents if isinstance(documents, str) else " ".join(documents)
    input_text = f"question: {query} context: {context}"

    # Encode the assembled prompt as one sequence. An explicit max_length makes
    # truncation=True effective even if the tokenizer defines no default limit.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]



# End-to-end demo: index a tiny corpus, retrieve passages, generate an answer.

# In-memory knowledge base; retrieval maps index ids back to positions here.
corpus = [
    "Since it was built and opened to the public in 1889, the Eiffel Tower instantly gained an international fame, as it was then the tallest building in the world. Its peculiar iron silhouette instantly traveled across the world in the newspapers.",
    "The Louvre is the world's largest art museum and a historic monument in Paris, France.",
    "The Statue of Liberty is a colossal neoclassical sculpture on Liberty Island in New York Harbor.",
]

# Retriever: encoder, its tokenizer, and the FAISS index over the corpus.
index, tokenizer, model = setup_retriever(corpus, "bert-base-uncased")

# Generator: a separate tokenizer/model pair for producing the answer.
gen_tokenizer, gen_model = setup_generator("t5-base")

query = "Tell me about Eiffel Tower"

# Step 1: fetch the passages closest to the query.
documents = retrieve_documents(query, index, tokenizer, model)
print("Retrieved Documents:", documents)

# Step 2: answer from the retrieved passages, if there are any.
if not documents:
    print("No relevant documents found.")
else:
    response = generate_response(query, documents, gen_tokenizer, gen_model)
    print(response)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model Output Tokens: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1954,  0.0347, -0.2327,  ..., -0.2592,  0.2305,  0.4515],
         [ 0.6018, -0.2306, -0.2690,  ...,  0.2318,  0.6845,  0.0312],
         [-0.3186, -1.0884,  0.0023,  ...,  0.2784, -0.2035,  0.3597],
         ...,
         [ 0.6715,  0.2540, -0.1234,  ..., -0.5904, -0.3914,  0.0875],
         [ 0.2941, -0.4231, -1.0190,  ...,  0.6464,  0.1077,  0.0045],
         [ 1.0575,  0.0672, -0.3260,  ...,  0.1318, -0.6374, -0.1919]]]), pooler_output=tensor([[-8.8834e-01, -3.0073e-01, -1.1459e-02,  6.4608e-01, -1.5759e-02,
          2.1981e-03,  7.9231e-01,  2.6701e-01,  7.7451e-02, -9.9995e-01,
         -1.7979e-01,  4.8561e-01,  9.8979e-01, -1.1819e-01,  9.5052e-01,
         -5.4916e-01,  1.2901e-01, -5.9009e-01,  2.3211e-01, -3.5158e-01,
          6.3564e-01,  9.9492e-01,  4.6591e-01,  3.2373e-01,  4.0483e-01,
          8.2039e-01, -6.1613e-01,  9.5330e-01,  9.5860e-01,  7.4825e-01,
         -6.297