In [22]:
import requests
from bs4 import BeautifulSoup
import pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os

load_dotenv() 

def fetch_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

def chunk_text(text):
    return text.split('\n')

In [35]:
import random

def get_bible_verses():
    bible_url = 'https://openbible.com/textfiles/kjv.txt'
    bible_text = fetch_text(bible_url)
    print(f'Bible text length: {len(bible_text)} characters')
    bible_chunks = chunk_text(bible_text)
    print(f'Number of Bible chunks: {len(bible_chunks)}')
    
    parsed_verses = []
    for chunk in bible_chunks[2:]:  # Skip the first two lines
        parts = chunk.split('\t', 1)
        if len(parts) == 2:
            verse, text = parts
            parsed_verses.append((verse.strip(), text.strip()))
    
    print('Sample Bible verses:')
    sample_verses = random.sample(parsed_verses, 3)
    for verse in sample_verses:
        print(f'- {verse}')
    
    return parsed_verses

bible_verses = get_bible_verses()

Bible text length: 4606957 characters
Number of Bible chunks: 31105
Sample Bible verses:
- ('1 Samuel 30:25', 'And it was [so] from that day forward, that he made it a statute and an ordinance for Israel unto this day.')
- ('Numbers 15:15', 'One ordinance [shall be both] for you of the congregation, and also for the stranger that sojourneth [with you], an ordinance for ever in your generations: as ye [are], so shall the stranger be before the LORD.')
- ('Luke 2:40', 'And the child grew, and waxed strong in spirit, filled with wisdom: and the grace of God was upon him.')


In [40]:
def get_muslim_text():
    muslim_url = 'https://www.sacred-texts.com/isl/quran.htm'
    muslim_text = fetch_text(muslim_url)
    print(f'Muslim text length: {len(muslim_text)} characters')
    return muslim_text

def chunk_muslim_text(text):
    chunks = chunk_text(text)
    print(f'Number of Muslim chunks: {len(chunks)}')
    if chunks:
        print(f'Sample Muslim chunk: {chunks[0]}')
    return chunks

muslim_text = get_muslim_text()
muslim_chunks = chunk_muslim_text(muslim_text)

print('First 5 Muslim chunks:')
for chunk in muslim_chunks[:5]:
    print(chunk)
    print('---')

Muslim text length: 28166 characters
Number of Muslim chunks: 858
Sample Muslim chunk: 
First 5 Muslim chunks:

---

---

---

---

---


In [11]:

# Get Pinecone API key from environment variable
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

if not pinecone_api_key:
    raise ValueError('PINECONE_API_KEY environment variable is not set')

print(f'Pinecone API key: {pinecone_api_key[:5]}...{pinecone_api_key[-5:]}')


# Initialize Pinecone (replace with your actual API key and environment)
pinecone.init(api_key='YOUR_PINECONE_API_KEY', environment='YOUR_PINECONE_ENVIRONMENT')
index_name = 'religious_texts'

# Create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=384)  # Adjust dimension based on your embedding model

# Connect to the index
index = pinecone.Index(index_name)

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare and store data
for source, text in [('Bible', bible_text), ('Buddhist', buddhist_text), ('Muslim', muslim_text)]:
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        # Create embedding
        embedding = model.encode(chunk).tolist()

        # Prepare metadata
        metadata = {
            'source': source,
            'chunk_id': i
        }

        # Upsert to Pinecone
        index.upsert(vectors=[(f'{source}_{i}', embedding, metadata)])

print('Sacred texts have been retrieved, chunked, and stored in the Pinecone database.')



Pinecone API key: 88884...7aa7d


NameError: name 'pinecone' is not defined