In [2]:
# embeddings.ipynb

import sys
from pathlib import Path

# Add your Django project to Python path
project_path = Path.cwd()  # Go up one directory level
sys.path.append(str(project_path))

# Setup Django environment
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings')  # adjust if your settings are different

# Now you can import your Django models/settings if needed

# Import required libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import pickle
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load embedding model
logger.info("Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load texts
base_path = Path("datasets")
all_texts = []
all_sources = []

# Load text files from spiritual_texts directory
spiritual_texts_path = base_path / "spiritual_texts"
if spiritual_texts_path.exists():
    logger.info("Loading text files...")
    for txt_file in spiritual_texts_path.glob("*.txt"):
        try:
            with open(txt_file, "r", encoding="utf-8") as f:
                content = f.read()
                paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
                all_texts.extend(paragraphs)
                all_sources.extend([txt_file.stem] * len(paragraphs))
            logger.info(f"Loaded {len(paragraphs)} paragraphs from {txt_file.name}")
        except Exception as e:
            logger.error(f"Error loading {txt_file}: {str(e)}")
            continue

# Load CSVs
try:
    logger.info("Loading CSV files...")
    # Buddha quotes
    buddha_quotes = pd.read_csv(base_path / "buddha_quotes.csv")
    all_texts.extend(buddha_quotes["quote"].tolist())
    all_sources.extend(["Buddha"] * len(buddha_quotes))

    # Asana benefits
    asana_benefits = pd.read_csv(base_path / "asana_benefits.csv")
    all_texts.extend(asana_benefits["description"].tolist())
    all_sources.extend(["Yoga"] * len(asana_benefits))

    # Meditation
    meditation = pd.read_csv(base_path / "meditation.csv")
    all_texts.extend(meditation["text"].tolist())
    all_sources.extend(["Meditation"] * len(meditation))

    # Rumi poetry
    rumi_poetry = pd.read_excel(base_path / "rumi_poetry.xlsx")
    all_texts.extend(rumi_poetry["poem"].tolist())
    all_sources.extend(["Rumi"] * len(rumi_poetry))

except Exception as e:
    logger.error(f"Error loading CSV/Excel files: {str(e)}")

# Create embeddings
logger.info(f"Creating embeddings for {len(all_texts)} texts...")
embeddings = model.encode(all_texts, show_progress_bar=True)

# Create knowledge base
knowledge_base = {
    "texts": all_texts,
    "sources": all_sources,
    "embeddings": embeddings,
}

# Save embeddings
cache_path = base_path / "cached_embeddings.pkl"
logger.info(f"Saving embeddings to {cache_path}")
with open(cache_path, "wb") as f:
    pickle.dump(knowledge_base, f)

logger.info("Done! Embeddings saved successfully.")

# Optional: Test the embeddings
test_query = "What is the meaning of life?"
query_embedding = model.encode(test_query)
similarities = np.dot(embeddings, query_embedding)
top_indices = np.argsort(similarities)[-3:][::-1]

print("\nTesting embeddings with query:", test_query)
for idx in top_indices:
    print(f"\nFrom {all_sources[idx]}:")
    print(all_texts[idx])
    print(f"Similarity score: {similarities[idx]:.4f}")

  from tqdm.autonotebook import tqdm, trange
INFO:__main__:Loading embedding model...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loading text files...
INFO:__main__:Loaded 4280 paragraphs from Autobiography-of-a-Yogi-by-Paramhansa-Yogananda.txt
INFO:__main__:Loaded 9551 paragraphs from vedic_hymns_pt_2.txt
ERROR:__main__:Error loading datasets/spiritual_texts/Hildegard Writings.txt: 'utf-8' codec can't decode byte 0x92 in position 347: invalid start byte
INFO:__main__:Loaded 862 paragraphs from History of Zoroastrianism - M.N. Dhalla.txt
ERROR:__main__:Error loading datasets/spiritual_texts/kitab i ilqan book of certitude.txt: 'utf-8' codec can't decode byte 0x97 in position 646: invalid start byte
INFO:__main__:Loaded 2774 paragraphs from Occult_Theocracy.txt
INFO:__main__:Loaded 11290 paragraphs from vedic_hymns_pt_1.txt
INFO:__main__:Loa

KeyboardInterrupt: 