In [1]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Path to your JSON file
json_file_path = r"nifty_50_processed_data.json"

# Load the data
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Initialize the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare texts and metadata
texts = []
metadata = []

for company_data in data:  # Assuming top-level contains lists of chunks per company
    for chunk in company_data:
        texts.append(chunk["text"])
        metadata.append(chunk["metadata"])

# Generate embeddings for all text chunks
embeddings = model.encode(texts, show_progress_bar=True)

# Convert embeddings to numpy array
embeddings_np = np.array(embeddings, dtype="float32")

# Initialize a FAISS index
dimension = embeddings_np.shape[1]  # Dimensionality of embeddings
index = faiss.IndexFlatL2(dimension)  # L2 distance index

# Add embeddings to the FAISS index
index.add(embeddings_np)

# Save the FAISS index to disk
faiss_index_path = r"nifty_50_index.faiss"
faiss.write_index(index, faiss_index_path)

# Save metadata to a JSON file
metadata_path = r"nifty_50_metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print("Embeddings and metadata have been successfully stored.")


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Could not find the operator torchvision::nms. Please make sure you have already registered the operator and (if registered from C++) loaded it via torch.ops.load_library.

In [10]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths to your FAISS index and metadata files
faiss_index_path = r"nifty_50_index.faiss"

processed_data_path = r"nifty_50_processed_data.json"

# Load FAISS index
index = faiss.read_index(faiss_index_path)



# Load and flatten processed data for text chunks
with open(processed_data_path, "r", encoding="utf-8") as f:
    nested_data = json.load(f)

# Flatten the nested structure into a single list
processed_data = [item for sublist in nested_data for item in sublist]

# Initialize the SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve(query, k=5):
    """Retrieve top-k relevant text chunks and metadata for a given query."""
    # Embed the query
    query_embedding = model.encode([query])

    # Search the FAISS index
    distances, indices = index.search(np.array(query_embedding, dtype="float32"), k)

    # Fetch corresponding metadata and text
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        if idx != -1:  # Ensure valid index
            # Find the corresponding text and metadata
            chunk = processed_data[idx]  # Access the correct flattened item
            result = {
                "text": chunk["text"],
                "metadata": chunk["metadata"],
                "distance": distance
            }
            results.append(result)

    return results

# Example usage
query = "Who is Gautambhai Adani ?"
results = retrieve(query, k=3)

# Display the results
for i, result in enumerate(results):
    print(f"Result {i + 1}:")
    print(f"Text: {result['text']}")
    print(f"Metadata: {result['metadata']}")
    print(f"Distance: {result['distance']}")
    print("-" * 50)


Result 1:
Text: Adani Enterprises Limited (Symbol: ADANIENT.NS) executive details: 
1. Name: Mr. Gautambhai Shantilal Adani S.Y. B.Com, Title: Executive Chairman, Total Pay: 24600000.0.
2. Name: Mr. Jugeshinder  Singh, Title: Chief Financial Officer, Total Pay: 97400000.0.
3. Name: Mr. Jatinkumar  Jalundhwala B.Com LLB FCS, Title: Joint President of Legal, Compliance Officer & Company Secretary, Total Pay: 36700000.0.
4. Name: Mr. Rajeshbhai Shantilal Adani B.Com., Title: MD of AEL & Director, Total Pay: 83700000.0.
5. Name: Mr. Vinay Prakash Goel, Title: Chief Executive Officer of Natural Resources, Total Pay: 893700000.0.
6. Name: Mr. Pranav Vinodbhai Adani, Title: MD of Agro, Oil & Gas and Executive Director, Total Pay: 64600000.0.
7. Name: Mr. Sudipta  Bhattacharya, Title: Group C.T.O, Total Pay: Not Disclosed.
8. Name: Ms. Varsha  Chainani, Title: Head of Corporate Communications, Total Pay: Not Disclosed.
9. Name: Dr. Malay Rameshchandra Mahadevia, Title: Group Director of HR, To

In [1]:
#we are using this one

import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder

# Paths to files
json_file_path = r"nifty_50_processed_data.json"
faiss_index_path = r"nifty_50_index.faiss"

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # For re-ranking

# Load and flatten processed data
with open(json_file_path, "r", encoding="utf-8") as f:
    nested_data = json.load(f)

# Flatten the nested structure into a single list
processed_data = [item for sublist in nested_data for item in sublist]

# Prepare texts and metadata
texts = [item["text"] for item in processed_data]
metadata = [item["metadata"] for item in processed_data]

# Generate and normalize embeddings for FAISS
embeddings = model.encode(texts, show_progress_bar=True)
embeddings_np = np.array(embeddings, dtype="float32")
faiss.normalize_L2(embeddings_np)  # Normalize embeddings for cosine similarity

# Initialize FAISS index for Inner Product search
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity
index.add(embeddings_np)  # Add embeddings to index

# Save the FAISS index to disk
faiss.write_index(index, faiss_index_path)

# Query and retrieval function
def retrieve(query, k=5, filter_metadata=None):
    """Retrieve top-k relevant text chunks for a given query."""
    # Embed and normalize the query
    query_embedding = model.encode([query], normalize_embeddings=True)
    
    # Search FAISS index
    distances, indices = index.search(np.array(query_embedding, dtype="float32"), k)
    
    # Fetch corresponding metadata and text
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        if idx != -1:
            chunk = processed_data[idx]
            if filter_metadata is None or chunk["metadata"] == filter_metadata:
                results.append({
                    "text": chunk["text"],
                    "metadata": chunk["metadata"],
                    "distance": distance
                })
    
    # Re-rank results using CrossEncoder
    if results:
        rerank_scores = cross_encoder.predict([(query, result["text"]) for result in results])
        results = sorted(zip(results, rerank_scores), key=lambda x: x[1], reverse=True)
        results = [result for result, _ in results]
    
    return results





  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 7/7 [00:02<00:00,  2.34it/s]


In [2]:
query = "Who is Gautambhai Adani ?"
top_k_results = retrieve(query, k=3)

# Display the results
for i, result in enumerate(top_k_results):
    print(f"Result {i + 1}:")
    print(f"Text: {result['text']}")
    print(f"Metadata: {result['metadata']}")
    print(f"Distance: {result['distance']}")
    print("-" * 50)

Result 1:
Text: Adani Ports and Special Economic Zone Limited (Symbol: ADANIPORTS.NS) executive details: 
1. Name: Mr. Gautambhai Shantilal Adani S.Y. B.Com, Title: Executive Chairman, Total Pay: 68000000.0.
2. Name: Mr. Ashwani  Gupta, Title: Whole-Time Director & CEO, Total Pay: 125592347.0.
3. Name: Mr. D.  Muthukumaran, Title: Chief Financial Officer, Total Pay: 591000.0.
4. Name: Mr. Karan Gautam Adani, Title: MD & Director, Total Pay: 39000000.0.
5. Name: Mr. Charanjit  Singh, Title: Head - ESG & IR, Total Pay: Not Disclosed.
6. Name: Mr. Kamlesh  Bhagia, Title: Company Secretary & Compliance Officer, Total Pay: Not Disclosed.
7. Name: Captain Sandeep M. Mehta, Title: President of Business Development, Total Pay: Not Disclosed.
8. Name: Mr. Subrat  Tripathy, Title: Chief Executive Officer of Ports, Total Pay: Not Disclosed.
9. Name: Mr. Divij  Taneja, Title: Chief Executive Officer of Logistics, Total Pay: Not Disclosed.
10. Name: Mr. Rajesh Kumar Jha, Title: MD & CEO of Adani Vi

In [52]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [53]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load LLaMA 2 tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",  # Automatically map to GPU if available
    torch_dtype="float16"  # Use FP16 for faster inference on GPU
)

# Function to format the prompt and generate a response
def generate_llm_response(query, top_k_results, max_length=200):
    """
    Generate a response using LLaMA 2 based on retrieved results.

    Args:
    - query (str): The user query.
    - top_k_results (list): List of retrieved documents with text and metadata.
    - max_length (int): Maximum length of the generated response.

    Returns:
    - response (str): The response generated by LLaMA 2.
    """
    # Format context from retrieved results
    context = "\n".join([f"Result {i + 1}: {result['text']}" for i, result in enumerate(top_k_results)])
    
    # Create the prompt
    prompt = f"""
    You are a helpful assistant. Answer the following question using the provided context:

    Context:
    {context}

    Question: {query}

    Answer:
    """
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Move inputs to GPU if available
    
    # Generate the response
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        temperature=0.7,  # Control randomness
        top_p=0.9,        # Nucleus sampling
        do_sample=True    # Enable sampling
    )
    
    # Decode and return the generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "Who is Gautambhai Adani?"
top_k_results = retrieve(query, k=3)  # Retrieve top-k relevant results

# Generate a response using LLaMA 2
refined_response = generate_llm_response(query, top_k_results)

# Display the refined response
print("Refined Response:")
print(refined_response)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-675fc9b1-10f431e13da7e00213b122a7;9b646a48-840d-446f-9a43-2a4041252514)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

In [42]:
from llama_index import KeywordTableIndex, QueryEngine, Document, GPTSimpleVectorIndex
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Paths to files
json_file_path = r"D:\RAG\nifty_50_processed_data.json"
faiss_index_path = r"D:\RAG\nifty_50_index.faiss"

# Initialize LlamaIndex
# 1. Loading FAISS Index
dimension = 384  # Change if your model output dimension is different
index = faiss.read_index(faiss_index_path)

# 2. Load processed data
with open(json_file_path, "r", encoding="utf-8") as f:
    nested_data = json.load(f)

# Flatten the data
processed_data = [item for sublist in nested_data for item in sublist]

# 3. Build documents for LlamaIndex
documents = [
    Document(text=item["text"], extra_info=item["metadata"])
    for item in processed_data
]

# Create a keyword index (optional)
keyword_index = SimpleKeywordTableIndex.from_documents(documents)

# Create the vector index using GPTSimpleVectorIndex
vector_index = GPTSimpleVectorIndex.from_documents(documents)

# Save the vector index for later use
vector_index.save_to_disk("nifty50_vector_index.json")


ImportError: cannot import name 'KeywordTableIndex' from 'llama_index' (unknown location)

In [54]:
from llama_index.core import SummaryIndex, Document

ImportError: cannot import name 'SummaryIndex' from 'llama_index.core' (unknown location)