In [7]:
!pip install transformers torch numpy openai



In [8]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import os
import re
import uuid
import torch
import json

In [9]:
# Folder that holds the raw source documents. The default is the Colab path
# this notebook was written for; set the DOCUMENTS_DIR environment variable to
# point the notebook at another location without editing the code.
directory_path = os.environ.get("DOCUMENTS_DIR", "/content/documents")
# exist_ok=True keeps this cell idempotent when it is re-run.
os.makedirs(directory_path, exist_ok=True)


In [10]:
def load_documents(directory_path):
    """Read every regular file directly inside ``directory_path`` as UTF-8 text.

    Parameters:
        directory_path (str): Folder to scan. Only its immediate entries are
            considered; anything that is not a regular file is skipped.

    Returns:
        dict: Maps a freshly generated UUID4 string to
            ``{"text": <file contents>, "metadata": {"file_name": <entry name>}}``.
            An empty folder yields an empty dict.
    """
    loaded = {}

    for entry_name in os.listdir(directory_path):
        full_path = os.path.join(directory_path, entry_name)
        if not os.path.isfile(full_path):
            continue  # not a regular file (e.g. a sub-directory)

        with open(full_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        # IDs are random, so they differ on every run.
        loaded[str(uuid.uuid4())] = {
            "text": contents,
            "metadata": {"file_name": entry_name},
        }

    return loaded

# Load the documents and show a compact summary. Printing the whole dictionary
# would dump the full text of every file into the cell output.
documents = load_documents(directory_path)
print(f"Loaded {len(documents)} document(s):")
for doc_id, doc_data in documents.items():
    # A short preview is enough to confirm the file was read correctly.
    print(
        f"  {doc_id}: {doc_data['metadata']['file_name']} "
        f"({len(doc_data['text'])} characters) {doc_data['text'][:80]!r}"
    )




In [11]:
def chunk_text(documents, model_name="bert-base-uncased", chunk_size=200, tokenizer=None):
    """Split every document into paragraph-bounded chunks of at most ``chunk_size`` tokens.

    Each document's text is split into paragraphs on blank lines ("\\n\\n").
    Within a paragraph, whitespace-separated words are packed greedily into a
    chunk until adding the next word would push the tokenized length past
    ``chunk_size``; that word then starts a new chunk. Chunks never span
    paragraphs.

    Parameters:
        documents (dict): ``{doc_id: {"text": str, "metadata": {"file_name": str}}}``.
        model_name (str): Tokenizer to load via ``AutoTokenizer.from_pretrained``
            when ``tokenizer`` is not supplied.
        chunk_size (int): Maximum number of tokens per chunk.
        tokenizer: Optional object exposing ``tokenize(str) -> list``. Pass an
            already-loaded tokenizer to avoid loading it again.

    Returns:
        dict: ``{chunk_id: {"text": str, "metadata": {"doc_id", "file_name"}}}``
            keyed by a fresh UUID4 string per chunk, in document/paragraph order.

    Note:
        A single word that by itself tokenizes to more than ``chunk_size``
        tokens is still emitted as its own (oversized) chunk.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_chunks = {}  # chunk_id -> chunk record

    for doc_id, doc_data in documents.items():
        paragraphs = re.split("\n\n", doc_data["text"])

        for paragraph in paragraphs:
            current_chunk = ""
            pieces = []

            for word in paragraph.split():
                candidate = current_chunk + " " + word if current_chunk else word
                if len(tokenizer.tokenize(candidate)) <= chunk_size:
                    current_chunk = candidate
                    continue
                # The candidate is too long: flush what has been collected so far.
                # When nothing has been collected (the word alone is over the
                # limit) there is nothing to flush - appending here would
                # create an empty chunk.
                if current_chunk:
                    pieces.append(current_chunk)
                current_chunk = word

            if current_chunk:
                pieces.append(current_chunk)

            # `piece` rather than `chunk_text`, to avoid shadowing this function's name.
            for piece in pieces:
                all_chunks[str(uuid.uuid4())] = {
                    "text": piece,
                    "metadata": {"doc_id": doc_id, "file_name": doc_data["metadata"]["file_name"]}
                }

    return all_chunks

# Chunk every loaded document with chunk_text's default model and chunk size,
# then show one (chunk_id, chunk_data) pair as a sanity check.
all_chunks = chunk_text(documents)
print("Sample Chunk:", list(all_chunks.items())[:1])  # [:1] gives [] rather than an error when no chunks were produced




import numpy as np
import torch

def generate_embeddings(data, model, tokenizer):
    """Embed the text of every chunk by mean-pooling the model's last hidden state.

    Parameters:
        data (dict): ``{chunk_id: {"text": str, "metadata": ...}}``.
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding=True, truncation=True)``.

    Returns:
        dict: ``{chunk_id: {"embedding": numpy array, "metadata": ..., "text": str}}``
            in the same order as ``data``.
    """

    def _embed(text):
        # One text per call; gradients are not needed for inference.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            # Average over dim 1, then drop singleton dimensions and convert to numpy.
            return hidden_states.mean(dim=1).squeeze().numpy()

    return {
        chunk_id: {
            "embedding": _embed(record["text"]),
            "metadata": record["metadata"],
            "text": record["text"],  # kept next to the vector for convenience
        }
        for chunk_id, record in data.items()
    }




In [12]:
# Load the encoder and its tokenizer, then embed every chunk.
model_name = 'bert-base-uncased'  # Example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
embeddings = generate_embeddings(all_chunks, model, tokenizer)

# An empty documents folder produces no chunks and therefore no embeddings;
# indexing the first entry unconditionally would raise IndexError in that case.
if embeddings:
    sample_embedding = next(iter(embeddings.values()))['embedding']
    print("Embedding Shape:", sample_embedding.shape)
else:
    print(f"No chunks to embed - add text files to {directory_path} and re-run the cells above.")


Embedding Shape: (768,)


In [13]:
import json

def save_json(file_path, data):
    """
    Save a dictionary to a JSON file.

    The data is serialized completely before the target file is opened, so a
    serialization failure leaves any existing file at ``file_path`` untouched.

    Parameters:
        file_path (str): Path to save the JSON file.
        data (dict): Dictionary to be saved. Must be JSON-serializable
            (e.g. convert numpy arrays with ``.tolist()`` first).

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
    """
    # open(..., 'w') truncates the file immediately, so serialize first.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

# Split the chunk data into two stores keyed by chunk ID: plain text and vectors.
# NOTE: this rebinds `documents` from the loaded files ({doc_id: {...}}) to
# {chunk_id: text}; the retrieval cells below rely on the chunk-keyed form.
documents = {chunk_id: chunk_data['text'] for chunk_id, chunk_data in all_chunks.items()}

# numpy arrays are not JSON-serializable, so each vector is stored as a plain list.
mapped_document_db = {
    chunk_id: embeddings[chunk_id]['embedding'].tolist() for chunk_id in all_chunks
}

# Persist both stores next to the notebook.
save_json('doc_store.json', documents)
save_json('vector_store.json', mapped_document_db)

print("Document and vector stores saved to JSON files.")


Document and vector stores saved to JSON files.


In [14]:
# Dump every embedding record for inspection: ID, vector shape, metadata and
# source text, followed by a blank line between records.
for chunk_id, chunk_data in embeddings.items():
    print(
        f"Chunk ID: {chunk_id}",
        f"Embedding Shape: {chunk_data['embedding'].shape}",
        f"Metadata: {chunk_data['metadata']}",
        f"Text: {chunk_data['text']}\n",
        sep="\n",
    )


Chunk ID: 3f7a024e-6906-4a97-b5ca-043fb744a81f
Embedding Shape: (768,)
Metadata: {'doc_id': '001adec2-fc9b-418d-9d4c-1afa7e5706b2', 'file_name': 'squad_samples_New_Delhi.txt'}

Chunk ID: 3b0b8e07-5c7b-4780-8575-d75a143a4ad2
Embedding Shape: (768,)
Metadata: {'doc_id': '001adec2-fc9b-418d-9d4c-1afa7e5706b2', 'file_name': 'squad_samples_New_Delhi.txt'}
Text: Sample 1 ID: 5706899e75f01819005e7be8 Question: The Delhi Flying Club was established in what year? Context: The Delhi Flying Club, established in 1928 with two de Havilland Moth aircraft named Delhi and Roshanara, was based at Safdarjung Airport which started operations in 1929, when it was the Delhi's only airport and the second in India. The airport functioned until 2001, however in January 2002 the government closed the airport for flying activities because of security concerns following the New York attacks in September 2001. Since then, the club only carries out aircraft maintenance courses, and is used for helicopter rides to 

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_top_k(query_embedding, embeddings, k=5):
    """Rank chunks by cosine similarity to ``query_embedding`` and return the best ``k``.

    All similarities are computed in one vectorized NumPy operation instead of
    one library call per chunk.

    Parameters:
        query_embedding: 1-D array-like query vector (it does not need to be
            normalized).
        embeddings (dict): ``{chunk_id: {"embedding": 1-D array-like, ...}}``;
            every vector must have the same length as the query.
        k (int): Number of results to keep.

    Returns:
        list: ``(chunk_id, similarity)`` tuples sorted by descending similarity,
            with similarity as a Python float. Ties keep the order of
            ``embeddings``. An empty ``embeddings`` dict yields ``[]``.
    """
    if not embeddings:
        return []

    chunk_ids = list(embeddings)
    matrix = np.stack(
        [np.asarray(embeddings[chunk_id]['embedding'], dtype=np.float64) for chunk_id in chunk_ids]
    )
    query_vector = np.asarray(query_embedding, dtype=np.float64)

    # A zero-length vector has no direction; using 1 as its norm makes its
    # similarity 0.0 instead of NaN from a division by zero.
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0] = 1.0
    query_norm = np.linalg.norm(query_vector)
    if query_norm == 0:
        query_norm = 1.0

    scores = (matrix @ query_vector) / (row_norms * query_norm)

    # sorted() is stable, so equal scores stay in insertion order.
    ranked = sorted(zip(chunk_ids, scores.tolist()), key=lambda item: item[1], reverse=True)
    return ranked[:k]


In [16]:
def sparse_retrieve_top_k(query, documents, k=5):
    """Rank documents by lexical overlap with ``query`` and return the best ``k``.

    The score is the fraction of distinct, lower-cased, whitespace-separated
    query words that also occur in the document, so it lies in the range 0-1.
    This is a simple word-overlap measure, not a cosine similarity. Punctuation
    stays attached to words ("delhi?" does not match "delhi").

    Parameters:
        query (str): Query text.
        documents (dict): ``{doc_id: text}`` or ``{doc_id: {"text": text, ...}}``;
            the two value forms may be mixed.
        k (int): Number of results to keep.

    Returns:
        list: ``(doc_id, score)`` tuples sorted by descending score; ties keep
            the order of ``documents``. An empty ``documents`` dict yields ``[]``,
            and an empty query scores every document 0.
    """
    # Lower-case both sides so the comparison is case-insensitive.
    query_words = set(query.lower().split())
    max_similarity = len(query_words)  # best possible overlap

    similarities = {}
    for doc_id, doc in documents.items():
        # Decide per value whether it is plain text or a record with a 'text'
        # key, instead of inspecting only the first entry (which also failed
        # on an empty dict).
        text = doc if isinstance(doc, str) else doc['text']
        doc_words = set(text.lower().split())

        overlap = len(query_words & doc_words)
        similarities[doc_id] = overlap / max_similarity if max_similarity > 0 else 0

    top_k = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:k]
    return top_k


In [17]:
def fuse_dense_sparse(dense_results, sparse_results, alpha=0.5, threshold=0.7):
    """
    Blend dense and sparse scores per chunk and keep those reaching a threshold.

    The fused score of a chunk is ``alpha * dense + (1 - alpha) * sparse``. A
    chunk present in only one of the two result lists gets 0 for the missing
    score.

    Parameters:
    - dense_results: List of tuples containing (chunk_id, dense_similarity).
    - sparse_results: List of tuples containing (chunk_id, sparse_similarity).
    - alpha: Weight of the dense score in the fused score (default is 0.5).
    - threshold: Minimum fused score a result needs to be kept (default is 0.7).

    Returns:
    - List of (chunk_id, fused_score) tuples with fused_score >= threshold,
      sorted by descending fused score.
    """
    dense_lookup = dict(dense_results)
    sparse_lookup = dict(sparse_results)

    # Every chunk that appears in at least one of the two result lists.
    candidate_ids = set(dense_lookup).union(sparse_lookup)

    combined = {
        candidate: alpha * dense_lookup.get(candidate, 0) + (1 - alpha) * sparse_lookup.get(candidate, 0)
        for candidate in candidate_ids
    }

    ranked = sorted(combined.items(), key=lambda pair: pair[1], reverse=True)
    return [(candidate, fused) for candidate, fused in ranked if fused >= threshold]


In [18]:
# Question used for the retrieval and generation cells below.
query = "What is the name of the last residential area built by the British Raj in Delhi?"

# Embed the query the same way the chunks were embedded: tokenize, run the
# encoder without gradients, and average the last hidden state over dim 1.
query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    query_output = model(**query_inputs)
    query_embeddings = query_output.last_hidden_state.mean(dim=1).squeeze()

# Convert to numpy and scale to unit length.
query_embeddings = query_embeddings.numpy()
query_norm = np.linalg.norm(query_embeddings)
normalized_query = query_embeddings / query_norm


In [19]:
# Perform dense retrieval
top_dense_results = retrieve_top_k(normalized_query, embeddings, k=10)

# Perform sparse retrieval
top_sparse_results = sparse_retrieve_top_k(query, documents, k=10)

# Fuse dense and sparse results.
# NOTE: with alpha=1 the sparse scores carry zero weight, so the ranking is
# purely dense; lower alpha to let lexical overlap contribute.
fused_results = fuse_dense_sparse(top_dense_results, top_sparse_results, alpha=1,threshold=0.7)

# Print fused results
print("Fused Top Results:")
for chunk_id, score in fused_results:
    # Named `matched_text`, not `chunk_text`: at module level that name would
    # overwrite the chunk_text() function defined earlier in the notebook.
    matched_text = documents[chunk_id]
    print(f"Chunk ID: {chunk_id}, Fused Score: {score:.4f}, Text: {matched_text}")


Fused Top Results:
Chunk ID: 320416d8-a5b6-427f-b5a9-81e86db7cf6a, Fused Score: 0.8321, Text: Sample 99 ID: 5705e82775f01819005e7741 Question: What was the name of the last residential area built by the British Raj? Context: After the capital of India moved to Delhi, a temporary secretariat building was constructed in a few months in 1912 in North Delhi. Most of the government offices of the new capital moved here from the 'Old secretariat' in Old Delhi (the building now houses the Delhi Legislative Assembly), a decade before the new capital was inaugurated in 1931. Many employees were brought into the new capital from distant parts of India, including the Bengal Presidency and Madras Presidency. Subsequently housing for them was developed around Gole Market area in the 1920s. Built in the 1940s, to house government employees, with bungalows for senior officials in the nearby Lodhi Estate area, Lodhi colony near historic Lodhi Gardens, was the last residential areas built by the Britis

In [20]:
import openai
from transformers import pipeline


# fused_results is sorted by descending fused score, so index 0 is the best
# match. NOTE: this raises IndexError when no result reached the fusion threshold.
first_match = fused_results[0]  # You can choose a different match if needed
chunk_id = first_match[0]  # Each entry is (chunk_id, score); keep only the ID
relevant_text = documents[chunk_id]  # `documents` maps chunk_id -> chunk text at this point

# Load the GPT-2 text-generation pipeline.
# pad_token_id=50256 is presumably GPT-2's end-of-text token ID - TODO confirm.
generator = pipeline('text-generation', model='gpt2',pad_token_id=50256)

def generate_llm_response(query, relevant_text):
    """Answer ``query`` with the module-level ``generator``, using ``relevant_text`` as context.

    The prompt has the form
    ``"Given the following relevant information: <context>\\n\\nQuestion: <query>\\nAnswer:"``.
    When the generated text starts with that prompt, the prompt is stripped and
    the answer is the continuation up to the next ``"Answer:"`` label (the model
    tends to keep inventing further question/answer pairs). Stripping the
    prompt, rather than splitting the whole text on ``"Answer:"``, keeps the
    extraction correct when the context or the query itself contains
    ``"Answer:"``.

    Parameters:
        query (str): Question to answer.
        relevant_text (str): Retrieved passage to ground the answer on.

    Returns:
        str | None: The extracted answer, or ``None`` if generation raised an
            exception (the error is printed, not re-raised).
    """
    try:
        # Construct the prompt with both relevant text and the query
        prompt = f"Given the following relevant information: {relevant_text}\n\nQuestion: {query}\nAnswer:"

        # Generate text with a bit more room for the response
        response = generator(prompt, max_new_tokens=100, num_return_sequences=1, truncation=True)

        generated_text = response[0]['generated_text'].strip()

        if generated_text.startswith(prompt):
            continuation = generated_text[len(prompt):]
            return continuation.split('Answer:')[0].strip()

        # The prompt was not echoed verbatim (e.g. it was truncated): fall back
        # to taking the text after the first "Answer:" label, if there is one.
        answer = generated_text.split('Answer:')[1].strip() if 'Answer:' in generated_text else generated_text
        return answer

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see None.
        print(f"Error generating response: {e}")
        return None

# Ask the generator to answer the query from the top retrieved chunk.
# generate_llm_response returns None when generation fails, so "None" may be printed.
response = generate_llm_response(query, relevant_text)
print("Generated Response:", response)



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Generated Response: The name has had many variations, from "Home" to "Home for Old Age" in English. For example, it can mean both: "home for old age" in English. Most of those who built it as a city, were foreigners. Some of the construction work was carried on by women who were paid to do construction work on other city projects. The following is from the 'Old secretariat' history.

Question: What is the name of the last residential area, located


In [21]:
import openai
from transformers import pipeline

# Same setup as the previous cell: take the best fused match and rebuild the pipeline.
# NOTE: fused_results[0] raises IndexError when no result reached the fusion threshold.
first_match = fused_results[0]  # You can choose a different match if needed
chunk_id = first_match[0]  # Each entry is (chunk_id, score); keep only the ID
relevant_text = documents[chunk_id]  # `documents` maps chunk_id -> chunk text at this point

# pad_token_id=50256 is presumably GPT-2's end-of-text token ID - TODO confirm.
generator = pipeline('text-generation', model='gpt2', pad_token_id=50256)

def generate_llm_response(query, relevant_text, temperature=0.7, top_k=50, max_tokens=150):
    """Answer ``query`` with the module-level ``generator``, using ``relevant_text`` as context.

    The prompt has the form
    ``"Given the following relevant information: <context>\\n\\nQuestion: <query>\\nAnswer:"``.
    When the generated text starts with that prompt, the prompt is stripped and
    the answer is the continuation up to the next ``"Answer:"`` label (the model
    tends to keep inventing further question/answer pairs). Stripping the
    prompt, rather than splitting the whole text on ``"Answer:"``, keeps the
    extraction correct when the context or the query itself contains
    ``"Answer:"``.

    Parameters:
        query (str): Question to answer.
        relevant_text (str): Retrieved passage to ground the answer on.
        temperature (float): Forwarded to the generator as ``temperature``.
        top_k (int): Forwarded to the generator as ``top_k``.
        max_tokens (int): Forwarded to the generator as ``max_new_tokens``.

    Returns:
        str | None: The extracted answer, or ``None`` if generation raised an
            exception (the error is printed, not re-raised).
    """
    try:
        # Construct the prompt with both relevant text and the query
        prompt = f"Given the following relevant information: {relevant_text}\n\nQuestion: {query}\nAnswer:"

        # Generate text with adjusted parameters
        response = generator(prompt, max_new_tokens=max_tokens, num_return_sequences=1,
                             temperature=temperature, top_k=top_k, truncation=True)

        generated_text = response[0]['generated_text'].strip()

        if generated_text.startswith(prompt):
            continuation = generated_text[len(prompt):]
            return continuation.split('Answer:')[0].strip()

        # The prompt was not echoed verbatim (e.g. it was truncated): fall back
        # to taking the text after the first "Answer:" label, if there is one.
        answer = generated_text.split('Answer:')[1].strip() if 'Answer:' in generated_text else generated_text
        return answer

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see None.
        print(f"Error generating response: {e}")
        return None


# Re-run generation with the parameterized version, using its default
# temperature, top_k and max_tokens.
# generate_llm_response returns None when generation fails, so "None" may be printed.
response = generate_llm_response(query, relevant_text)
print("Generated Response:", response)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Generated Response: The last residential area (Lamla Mandar) was built in the early 1930s in the Bijou. This building was converted to a residential building in 1942, at which time it was occupied by the Raj (later renamed the Army of India) before being demolished. The main building of this building, the former Lodhi Colony (later the Srinagar Colony), has been demolished and the building now stands as a residential building at the present time. In addition, the building was built in the vicinity of the Bijou and was the last residential area built by the British Raj. Question: What was the name of the last residential area built by the United Kingdom of Great Britain and Ireland in 1913?
