In [45]:
!pip install transformers torch numpy openai



In [46]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import os
import re
import uuid
import torch
import json

In [47]:
# Folder that holds the source documents. Defaults to the Colab path used by this
# notebook; set the DOCUMENTS_DIR environment variable to point somewhere else.
directory_path = os.environ.get("DOCUMENTS_DIR", "/content/documents")
os.makedirs(directory_path, exist_ok=True)


In [48]:
def load_documents(directory_path):
    """Read every UTF-8 text file directly inside ``directory_path``.

    Sub-directories are ignored. Files are visited in sorted name order so the
    resulting dictionary has a reproducible ordering, and files that are not
    valid UTF-8 are reported and skipped instead of aborting the whole load.

    Parameters:
        directory_path (str): Folder to scan (not recursive).

    Returns:
        dict: Maps a freshly generated UUID string to
            ``{"text": <file content>, "metadata": {"file_name": <name>}}``.
    """
    documents = {}  # Dictionary to store document ID and text content

    # os.listdir gives no ordering guarantee, so sort for run-to-run stability.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except UnicodeDecodeError as exc:
            # A stray binary file should not prevent the text files from loading.
            print(f"Skipping {filename}: not valid UTF-8 text ({exc})")
            continue

        # Generate unique document ID
        doc_id = str(uuid.uuid4())
        documents[doc_id] = {"text": text, "metadata": {"file_name": filename}}

    return documents

# Load documents and summarise what was found (printing the full texts would flood the output)
documents = load_documents(directory_path)
print(f"Loaded {len(documents)} document(s):", [doc["metadata"]["file_name"] for doc in documents.values()])
if not documents:
    # Later cells index into the first chunk/embedding, so flag the empty case early.
    print(f"Warning: no documents found in {directory_path}; add text files there before continuing.")




In [49]:
def _pack_words_into_chunks(words, tokenizer, chunk_size):
    """Greedily join consecutive words into chunks of at most ``chunk_size`` tokens."""
    chunks = []
    current_chunk = ""

    for word in words:
        candidate = current_chunk + " " + word if current_chunk else word
        if len(tokenizer.tokenize(candidate)) <= chunk_size:
            current_chunk = candidate
            continue

        # The candidate is too long. Flush what has been collected so far, but only
        # if there is something to flush: when the very first word is already over
        # the limit, ``current_chunk`` is still empty and must not become a chunk.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = word

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def chunk_text(documents, model_name="bert-base-uncased", chunk_size=200, tokenizer=None):
    """Split every document into paragraph-bounded chunks limited by token count.

    Each document's text is split on blank lines (``"\\n\\n"``); within a paragraph,
    whitespace-separated words are accumulated until adding the next word would
    push the tokenizer's token count above ``chunk_size``. A single word that is
    longer than ``chunk_size`` tokens on its own is kept as its own chunk.

    Parameters:
        documents (dict): Maps doc ID to ``{"text": str, "metadata": {"file_name": str}}``.
        model_name (str): Name passed to ``AutoTokenizer.from_pretrained`` when no
            ``tokenizer`` is supplied.
        chunk_size (int): Maximum number of tokens per chunk.
        tokenizer: Optional already-loaded tokenizer exposing ``tokenize(str)``;
            lets callers reuse one instance instead of loading it again.

    Returns:
        dict: Maps a freshly generated chunk UUID string to
            ``{"text": <chunk>, "metadata": {"doc_id": ..., "file_name": ...}}``.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    all_chunks = {}  # Dictionary to store all chunks with document and chunk IDs

    for doc_id, doc_data in documents.items():
        paragraphs = re.split("\n\n", doc_data["text"])  # Split text by paragraphs

        for paragraph in paragraphs:
            # Assign a unique ID to each chunk
            for chunk in _pack_words_into_chunks(paragraph.split(), tokenizer, chunk_size):
                all_chunks[str(uuid.uuid4())] = {
                    "text": chunk,
                    "metadata": {"doc_id": doc_id, "file_name": doc_data["metadata"]["file_name"]}
                }

    return all_chunks

# Chunk every loaded document, then show the first chunk (if any) as a sanity check
all_chunks = chunk_text(documents=documents)
print("Sample Chunk:", [*all_chunks.items()][:1])  # a one-element list, or [] when nothing was chunked




import numpy as np
import torch

def generate_embeddings(data, model, tokenizer):
    """Embed each chunk's text by averaging the model's last hidden state.

    Parameters:
        data (dict): Maps chunk ID to a dict with ``"text"`` and ``"metadata"`` keys.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable turning a string into the model's keyword inputs
            (called with ``return_tensors="pt"``, padding and truncation on).

    Returns:
        dict: Maps chunk ID to ``{"embedding": numpy array, "metadata": ..., "text": ...}``.
    """

    def _embed(text):
        # One text at a time: tokenize, run the model without gradient tracking,
        # then average over dimension 1 and drop the size-1 dimensions.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            return hidden.mean(dim=1).squeeze().numpy()

    return {
        chunk_id: {
            "embedding": _embed(chunk["text"]),
            "metadata": chunk["metadata"],
            "text": chunk["text"],  # keep the original text next to its vector
        }
        for chunk_id, chunk in data.items()
    }




In [50]:
# Generate embeddings for all chunks and print a sample to verify
model_name = 'bert-base-uncased'  # Example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
embeddings = generate_embeddings(all_chunks, model, tokenizer)

# Indexing the first key raises IndexError when no chunks exist (e.g. an empty
# documents folder), so report that case explicitly instead.
if embeddings:
    print("Embedding Shape:", next(iter(embeddings.values()))['embedding'].shape)
else:
    print("No chunks were embedded; check that the documents folder contains text files.")


Embedding Shape: (768,)


In [51]:
import json

def save_json(file_path, data):
    """
    Write ``data`` to ``file_path`` as UTF-8 encoded, 4-space indented JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    An existing file at ``file_path`` is overwritten.

    Parameters:
        file_path (str): Destination path of the JSON file.
        data (dict): JSON-serialisable dictionary to store.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)

# Build the two stores that get persisted, both keyed by chunk ID:
#   documents          -> chunk text
#   mapped_document_db -> embedding converted to a plain list so json can serialise it
# NOTE: this rebinds `documents`, which until this cell held the loaded source files.
documents = {chunk_id: chunk_data['text'] for chunk_id, chunk_data in all_chunks.items()}
mapped_document_db = {
    chunk_id: embeddings[chunk_id]['embedding'].tolist() for chunk_id in all_chunks
}

# Persist the text store and the vector store side by side
save_json('doc_store.json', documents)
save_json('vector_store.json', mapped_document_db)

print("Document and vector stores saved to JSON files.")


Document and vector stores saved to JSON files.


In [52]:
# Print the structure of a few embedding records to check for keys.
# Only a short preview is shown: printing every chunk with its full text floods the output.
PREVIEW_COUNT = 3
for chunk_id, chunk_data in list(embeddings.items())[:PREVIEW_COUNT]:
    print(f"Chunk ID: {chunk_id}")
    print(f"Embedding Shape: {chunk_data['embedding'].shape}")  # Shape of the embedding
    print(f"Metadata: {chunk_data['metadata']}")
    print(f"Text: {chunk_data['text']}\n")  # Print the text if available
print(f"Showing {min(PREVIEW_COUNT, len(embeddings))} of {len(embeddings)} chunks.")


Chunk ID: 0abd8de3-f9fc-45ff-8876-44d7306cc91c
Embedding Shape: (768,)
Metadata: {'doc_id': 'fa3e1444-f338-453b-a2aa-fcf30b2fab10', 'file_name': 'squad_samples_New_Delhi.txt'}

Chunk ID: a8edb042-46ab-4577-9d03-d4560d332813
Embedding Shape: (768,)
Metadata: {'doc_id': 'fa3e1444-f338-453b-a2aa-fcf30b2fab10', 'file_name': 'squad_samples_New_Delhi.txt'}
Text: Sample 1 ID: 5706899e75f01819005e7be8 Question: The Delhi Flying Club was established in what year? Context: The Delhi Flying Club, established in 1928 with two de Havilland Moth aircraft named Delhi and Roshanara, was based at Safdarjung Airport which started operations in 1929, when it was the Delhi's only airport and the second in India. The airport functioned until 2001, however in January 2002 the government closed the airport for flying activities because of security concerns following the New York attacks in September 2001. Since then, the club only carries out aircraft maintenance courses, and is used for helicopter rides to 

In [53]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_top_k(query_embedding, embeddings, k=5):
    """Rank stored chunks by cosine similarity to ``query_embedding``.

    All similarities are computed in one vectorised NumPy pass instead of one
    library call per chunk. A zero-length vector (query or chunk) yields a
    similarity of 0 rather than a division-by-zero result.

    Parameters:
        query_embedding: 1-D array-like query vector.
        embeddings (dict): Maps chunk ID to a dict whose ``'embedding'`` entry is a
            1-D array-like of the same length as ``query_embedding``.
        k (int | None): Number of results to return. ``None`` returns every chunk;
            zero or a negative value returns an empty list.

    Returns:
        list[tuple[str, float]]: ``(chunk_id, similarity)`` pairs, best match first.
            Chunks with equal similarity keep their order from ``embeddings``.
    """
    if not embeddings or (k is not None and k <= 0):
        return []

    chunk_ids = list(embeddings.keys())
    matrix = np.stack([np.asarray(embeddings[cid]['embedding'], dtype=float) for cid in chunk_ids])
    query_vec = np.asarray(query_embedding, dtype=float)

    # Cosine similarity = dot product / (|row| * |query|)
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero norm means the dot product is 0 as well; dividing by 1 keeps it at 0.
    denominators[denominators == 0] = 1.0
    similarities = (matrix @ query_vec) / denominators

    # Sort by similarity and return top k results
    ranked = sorted(zip(chunk_ids, similarities.tolist()), key=lambda item: item[1], reverse=True)
    return ranked[:k]


In [54]:
# Example query text
query = "What is the name of the last residential area built by the British Raj in Delhi?"

# Encode the query the same way the chunks were encoded: tokenize, run the model
# without gradient tracking, and average the last hidden state over dimension 1.
query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    query_output = model(**query_inputs)
    query_embeddings = query_output.last_hidden_state.mean(dim=1).squeeze().numpy()

# Scale the query vector to unit (L2) length
normalized_query = query_embeddings / np.linalg.norm(query_embeddings)


In [55]:
# Retrieve the top K results using the normalized query embedding
top_k_results = retrieve_top_k(normalized_query, embeddings, k=5)

# Print the top K results
print("Top K Retrieved Results:")
for chunk_id, similarity in top_k_results:
    # Named `retrieved_text` on purpose: assigning to `chunk_text` at notebook level
    # would overwrite the `chunk_text` function defined in an earlier cell.
    retrieved_text = embeddings[chunk_id]["text"]  # Access the text of the chunk
    metadata = embeddings[chunk_id]["metadata"]
    print(f"Chunk ID: {chunk_id}, Similarity: {similarity:.4f}, File: {metadata['file_name']}, Text: {retrieved_text}")


Top K Retrieved Results:
Chunk ID: d6aeea37-3440-4f47-9d82-5c971e5eaf2a, Similarity: 0.8321, File: squad_samples_New_Delhi.txt, Text: Sample 99 ID: 5705e82775f01819005e7741 Question: What was the name of the last residential area built by the British Raj? Context: After the capital of India moved to Delhi, a temporary secretariat building was constructed in a few months in 1912 in North Delhi. Most of the government offices of the new capital moved here from the 'Old secretariat' in Old Delhi (the building now houses the Delhi Legislative Assembly), a decade before the new capital was inaugurated in 1931. Many employees were brought into the new capital from distant parts of India, including the Bengal Presidency and Madras Presidency. Subsequently housing for them was developed around Gole Market area in the 1920s. Built in the 1940s, to house government employees, with bungalows for senior officials in the nearby Lodhi Estate area, Lodhi colony near historic Lodhi Gardens, was the la

In [85]:
import openai
from transformers import pipeline

# Assuming `top_k_results` and `embeddings` are defined previously.
# The chunk text is read from `embeddings` (which stores it next to each vector) so this
# cell does not depend on `documents` having been rebound to the chunk store earlier.
first_match = top_k_results[0]  # You can choose a different match if needed
chunk_id = first_match[0]  # Chunk ID of the most similar document
relevant_text = embeddings[chunk_id]["text"]

# Load the GPT-2 model on the first GPU when one is available, otherwise on the CPU (-1)
generation_device = 0 if torch.cuda.is_available() else -1
generator = pipeline('text-generation', model='gpt2', device=generation_device, pad_token_id=50256)

def generate_llm_response(query, relevant_text):
    """Ask the module-level ``generator`` pipeline to answer ``query`` from ``relevant_text``.

    The prompt embeds the retrieved text, then the question, and ends with
    ``"Answer:"``. The answer is taken from what the model wrote *after the
    prompt*, cut at the next ``"Answer:"`` label if the model starts another one.
    Locating the answer by prompt length (rather than by the first ``"Answer:"``
    in the output) matters because the retrieved text may itself contain an
    ``"Answer:"`` label.

    Parameters:
        query (str): The user's question.
        relevant_text (str): Retrieved context to ground the answer.

    Returns:
        str | None: The extracted answer, or ``None`` if generation failed
            (the error is printed).
    """
    try:
        # Construct the prompt with both relevant text and the query
        prompt = f"Given the following relevant information: {relevant_text}\n\nQuestion: {query}\nAnswer:"

        # Generate text with a bit more room for the response
        response = generator(prompt, max_new_tokens=100, num_return_sequences=1, truncation=True)

        # Extract the answer from the generated text
        generated_text = response[0]['generated_text'].strip()

        if generated_text.startswith(prompt):
            # Output echoes the prompt: everything after it is the model's continuation.
            continuation = generated_text[len(prompt):]
            return continuation.split('Answer:')[0].strip()

        # Prompt not echoed verbatim (e.g. it was truncated): fall back to the label search.
        answer = generated_text.split('Answer:')[1].strip() if 'Answer:' in generated_text else generated_text
        return answer

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"Error generating response: {e}")
        return None

# Run the generation step for the example query against the best-matching chunk
response = generate_llm_response(query=query, relevant_text=relevant_text)
print("Generated Response:", response)



Generated Response: The Royal Academy, Delhi

[The Royal Academy] in Delhi was the sole building on India's eastern fringe for the colonial era, in part as a response to the failure of Westerners in Delhi. It is an official residence of the British Raj and a part of the colonial regime in Delhi. While it was never formally built, the building remains a important symbol for Delhi's unique future because its proximity to the western fringe and a number of Indian cities like Mumbai, Mumbai, and Bandra
