In [None]:
import re  

def load_text(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path`` as one string."""
    # The context manager closes the handle even though we return from inside it.
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

def preprocess_text(text):
    """Normalise raw text before it is split into chunks.

    Steps, in order:
      1. delete every character that is not an ASCII letter or whitespace,
      2. collapse each run of whitespace into a single space (trimming both ends),
      3. lower-case the result.

    Characters are deleted, not replaced by a space, so tokens separated only by
    punctuation or digits are joined (``"Karma-Yoga"`` becomes ``"karmayoga"``).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = ' '.join(letters_only.split())
    return single_spaced.lower()

def split_into_chunks(text, chunk_size=400):
    """Split ``text`` into consecutive, non-overlapping slices of ``chunk_size`` characters.

    The split is purely positional: the final chunk may be shorter than
    ``chunk_size`` and a chunk boundary can fall in the middle of a word.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (default 400).

    Returns:
        A list of substrings that concatenate back to ``text``; an empty list
        when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Without this check a
            negative size silently produced no chunks at all and zero failed
            with an unrelated ``range()`` error message.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Run the preparation pipeline: read the source file, normalise it, cut it into chunks.
file_path = "/kaggle/input/bhagwat-gita/The Bhagavad Gita.txt"  # Replace with your file path
text = load_text(file_path)
cleaned_text = preprocess_text(text)
chunks = split_into_chunks(cleaned_text)

# Optionally persist the chunks, one per line, so they can be inspected or reused.
with open('preprocessed_chunks.txt', 'w', encoding='utf-8') as file:
    file.writelines(chunk + '\n' for chunk in chunks)

In [None]:
!pip install -U langchain-community

In [None]:
# Step 2: Generate embeddings and create a FAISS vector database  
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from typing import List, Optional

def create_vector_database(
    chunks: List[str],
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
    save_path: Optional[str] = None,
    device: str = 'cpu'
) -> FAISS:
    """
    Creates a FAISS vector database from text chunks using HuggingFace embeddings.

    Args:
        chunks (List[str]): List of text chunks to be embedded
        model_name (str): Name of the sentence transformer model to use
        save_path (Optional[str]): Path to save the vector database. If None
            (or empty), the database is not written to disk
        device (str): Device string forwarded to the embedding model through
            ``model_kwargs`` (for example ``'cpu'`` or ``'cuda'``). Defaults to
            ``'cpu'``, which is what this function always used before the
            parameter existed

    Returns:
        FAISS: A FAISS vector database containing the embeddings

    Raises:
        ValueError: If chunks list is empty
    """
    # Fail fast, before the (potentially slow) model load.
    if not chunks:
        raise ValueError("Chunks list cannot be empty")

    # Initialize the embedding model on the requested device
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device}
    )

    # Embed every chunk and build the FAISS index in one step
    vector_db = FAISS.from_texts(
        texts=chunks,
        embedding=embedding_function
    )

    # Save the database if path is provided
    if save_path:
        vector_db.save_local(save_path)

    return vector_db

In [None]:
!pip install faiss-cpu
!pip install huggingface_hub[hf_xet]

In [None]:
from sentence_transformers import SentenceTransformer  
from langchain.vectorstores import FAISS  

# Embed every chunk once with SentenceTransformer and build the FAISS store from
# those precomputed vectors.
from langchain.embeddings import HuggingFaceEmbeddings  # same import path the notebook already uses

EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# `model` stays a plain SentenceTransformer because the retrieval cell encodes
# queries with it directly.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# `encode` has no documented `truncate` argument, so it is not passed.
# NOTE(review): over-long inputs are presumably already truncated to the model's
# `max_seq_length` by the tokenizer - confirm against the sentence-transformers docs.
embeddings = model.encode(chunks)

# Create and save the vector database.
# `embedding` must be an Embeddings object (the store uses it to embed query
# text), not one of the vectors. The original passed `embeddings[0]`, which left
# the store unable to serve text queries such as `similarity_search(query)`.
vector_db = FAISS.from_embeddings(
    text_embeddings=list(zip(chunks, embeddings)),
    embedding=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
)
vector_db.save_local("faiss_vector_db")

In [None]:
def retrieve_relevant_chunks(query, vector_db, model, top_k=3):
    """Look up the ``top_k`` stored entries closest to ``query``.

    ``query`` is encoded with ``model`` as a one-item batch; the first (only)
    resulting vector is handed to ``vector_db.similarity_search_by_vector`` and
    whatever that call returns is passed straight back to the caller.
    """
    query_vector = model.encode([query])[0]
    return vector_db.similarity_search_by_vector(query_vector, k=top_k)

# Smoke-test the retriever with a sample question and print each returned chunk
# under a 1-based label.
query = "What is the concept of Karma in the Bhagavad Gita?"
relevant_chunks = retrieve_relevant_chunks(query, vector_db, model)
for rank, hit in enumerate(relevant_chunks, start=1):
    print(f"Chunk {rank}:\n{hit.page_content}\n")


In [None]:
!pip install google-generativeai langchain-google-genai sentence-transformers faiss-cpu  

In [None]:
!pip install google-generativeai sentence-transformers scikit-learn numpy  

In [None]:
import google.generativeai as genai  
from sentence_transformers import SentenceTransformer  
import numpy as np  
import os  
from sklearn.metrics.pairwise import cosine_similarity  

# 1. Configure Google Gemini
# SECURITY: an API key must never be written into the notebook. It is read from
# the GOOGLE_API_KEY environment variable; when that is unset the user is
# prompted with hidden input. The variable is still populated afterwards, as before.
# NOTE(review): the key that was previously hard-coded on this line has to be
# treated as leaked and revoked.
if not os.environ.get('GOOGLE_API_KEY'):
    from getpass import getpass  # stdlib; only needed on this fallback path
    os.environ['GOOGLE_API_KEY'] = getpass('Enter your Google API key: ')
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# 2. Initialize embedding model (no LangChain)
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 3. Load your documents (replace with your actual document loading logic)
# NOTE(review): these two strings are placeholders, so retrieval below only ever
# sees them. Presumably this should be the preprocessed `chunks` list - confirm.
documents = [
    "The Bhagavad Gita teaches...",  # Your document 1
    "Karma yoga emphasizes...",      # Your document 2
]

# 4. Pre-compute embeddings
document_embeddings = embedding_model.encode(documents)

def retrieve_relevant_documents(query, k=3):
    """Semantic search without FAISS.

    Encodes ``query`` with the module-level ``embedding_model``, scores it
    against the precomputed ``document_embeddings`` with cosine similarity and
    returns the best-scoring entries of ``documents``.

    Args:
        query: The text to search for.
        k: Maximum number of documents to return (default 3). When ``k`` exceeds
            the number of documents, all of them are returned.

    Returns:
        A list of at most ``k`` documents, highest similarity first. An empty
        list when ``k`` is zero or negative.
    """
    # Guard first: with the slice `[-k:]` used previously, k == 0 returned every
    # document and a negative k returned an arbitrary subset.
    if k <= 0:
        return []
    query_embedding = embedding_model.encode(query)
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    # argsort is ascending, so reverse it to get best-first and then take k.
    best_first = np.argsort(similarities)[::-1]
    return [documents[i] for i in best_first[:k]]

def get_answer(question, model_name='gemini-1.5-pro-latest'):
    """Answer ``question`` with Gemini, grounded on the retrieved documents.

    The documents returned by ``retrieve_relevant_documents(question)`` are
    joined with newlines and embedded, together with the question, in a single
    prompt that is sent to the Gemini model.

    Args:
        question: The user's question.
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.
            Defaults to the id this function has always used, so existing
            callers are unaffected.
            NOTE(review): confirm the default id is still served by the API.

    Returns:
        The response text, or the string ``"Error: <message>"`` if anything in
        retrieval or generation raises.
    """
    try:
        # Retrieve context
        context = "\n".join(retrieve_relevant_documents(question))

        # Generate answer using pure Gemini API
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(
            f"""Answer this question about the Bhagavad Gita:  
            
            Question: {question}  
            
            Context: {context}  
            
            Provide a detailed answer with relevant verses:"""
        )
        return response.text
    except Exception as e:
        # Deliberate best-effort: report the failure as text so the calling
        # cell keeps running instead of raising.
        return f"Error: {str(e)}"

# Example usage  
if __name__ == "__main__":
    # Send one sample question through retrieval + generation.
    question = "Explain the concept of Dharma in the Bhagavad Gita"
    answer = get_answer(question)
    # Show the question and the reply as two labelled lines.
    for label, value in (("Question:", question), ("Answer:", answer)):
        print(label, value)

In [None]:
    # Follow-up query: asks how Bhakti yoga differs from Gyan yoga and prints the
    # question next to the answer returned by get_answer().
    # NOTE(review): this cell is indented although it has no enclosing block, so
    # it would be an IndentationError as plain Python. It presumably runs only
    # because IPython strips a cell's common leading indentation - confirm and
    # dedent. It also repeats the ask-and-print pattern used in other cells.
    question = "What is concept of Bhaktiyog and how it is different from gyanyog"  
    answer = get_answer(question)  
    print("Question:", question)  
    print("Answer:", answer) 

In [None]:
    # Follow-up query: asks for an explanation, with examples, of Krishna as
    # superior to all, and prints the question next to get_answer()'s reply.
    # NOTE(review): this cell is indented although it has no enclosing block, so
    # it would be an IndentationError as plain Python. It presumably runs only
    # because IPython strips a cell's common leading indentation - confirm and
    # dedent. It also repeats the ask-and-print pattern used in other cells.
    question = "Explain the concept of krishna as Superior of all and give examples to support it"  
    answer = get_answer(question)  
    print("Question:", question)  
    print("Answer:", answer) 