In [69]:
import requests
from bs4 import BeautifulSoup
import pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os

load_dotenv() 

def fetch_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

def chunk_text(text):
    return text.split('\n')

In [None]:
import openai
from openai import OpenAI
import os

# Set up OpenAI API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def generate_embeddings(text_tuples):
    embeddings = []
    
    for metadata, content in text_tuples:
        # Combine metadata and content
        combined_text = f"{metadata} {content}"
        
        try:
            # Generate embedding using OpenAI's API
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=combined_text
            )
            
            # Extract the embedding from the response
            embedding = response.data[0].embedding
            embeddings.append(embedding)
        except Exception as e:
            print(f"Error generating embedding for {metadata}: {str(e)}")
    
    return embeddings


# Christianity

In [35]:
import random

def get_bible_verses():
    bible_url = 'https://openbible.com/textfiles/kjv.txt'
    bible_text = fetch_text(bible_url)
    print(f'Bible text length: {len(bible_text)} characters')
    bible_chunks = chunk_text(bible_text)
    print(f'Number of Bible chunks: {len(bible_chunks)}')
    
    parsed_verses = []
    for chunk in bible_chunks[2:]:  # Skip the first two lines
        parts = chunk.split('\t', 1)
        if len(parts) == 2:
            verse, text = parts
            parsed_verses.append((verse.strip(), text.strip()))
    
    print('Sample Bible verses:')
    sample_verses = random.sample(parsed_verses, 3)
    for verse in sample_verses:
        print(f'- {verse}')
    
    return parsed_verses

bible_verses = get_bible_verses()

Bible text length: 4606957 characters
Number of Bible chunks: 31105
Sample Bible verses:
- ('1 Samuel 30:25', 'And it was [so] from that day forward, that he made it a statute and an ordinance for Israel unto this day.')
- ('Numbers 15:15', 'One ordinance [shall be both] for you of the congregation, and also for the stranger that sojourneth [with you], an ordinance for ever in your generations: as ye [are], so shall the stranger be before the LORD.')
- ('Luke 2:40', 'And the child grew, and waxed strong in spirit, filled with wisdom: and the grace of God was upon him.')


# Islam 

In [65]:
import requests

# Muhammad Asad translation, (surah) number followed by the verse (ayah) number.
def get_all_verses_in_english():
    base_url = 'http://api.alquran.cloud/v1/surah/{}/en.asad'
    verses = []
    
    for surah_number in range(1, 115):  # Surahs are numbered from 1 to 114
        url = base_url.format(surah_number)
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve Surah {surah_number}")
            continue
        
        data = response.json()
        for ayah in data['data']['ayahs']:
            ayah_number = ayah['numberInSurah']
            verse_id = f"{surah_number}:{ayah_number}"
            text_content = ayah['text']
            verses.append((verse_id, text_content))
    return verses

# Example usage:
verses = get_all_verses_in_english()
for verse in verses[:5]:  # Print the first 5 verses
    print(verse)


('1:1', 'In the name of God, The Most Gracious, The Dispenser of Grace:')
('1:2', 'All praise is due to God alone, the Sustainer of all the worlds,')
('1:3', 'The Most Gracious, the Dispenser of Grace,')
('1:4', 'Lord of the Day of Judgment!')
('1:5', 'Thee alone do we worship; and unto Thee alone do we turn for aid.')


In [66]:
import random

def print_random_verses(verses, num_verses=5):
    """
    Print a specified number of random verses from the list of verses.
    
    :param verses: List of tuples containing (verse_id, verse_text)
    :param num_verses: Number of random verses to print (default: 5)
    """
    selected_verses = random.sample(verses, min(num_verses, len(verses)))
    
    print(f'Printing {num_verses} random verses:')
    for verse_id, verse_text in selected_verses:
        print(f'Verse {verse_id}:')
        print(verse_text)
        print('---')

# Print 5 random verses
print_random_verses(verses)


Printing 5 random verses:
Verse 38:13:
and [the tribe of] Thamud, and the people of Lot, and the dwellers of the wooded dales [of Madyan]: they all were leagued together, [as it were, in their unbelief:]
---
Verse 79:37:
For, unto him who shall have transgressed the bounds of what is right,
---
Verse 24:13:
why do they not [demand of the accusers that they] their allegation? for, if they do not produce such wit­nesses, it is those [accusers] who, in the sight of God, are liars indeed!
---
Verse 40:63:
[For] thus it is: perverted are the minds of those who knowingly reject God’s messages.
---
Verse 13:7:
However, they who are bent on denying the truth [refuse to believe and] say, "Why has no miraculous sign ever been bestowed on him from on high by his Sustainer?" [But] thou art only a warmer; and [in God] all people have a guide.
---


In [68]:
# Get Pinecone API key from environment variable
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

if not PINECONE_API_KEY:
    raise ValueError('PINECONE_API_KEY environment variable is not set')

print(f'Pinecone API key: {PINECONE_API_KEY[:5]}...{PINECONE_API_KEY[-5:]}')

# Initialize Pinecone
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = 'religious_texts'

# Create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=384)  # Adjust dimension based on your embedding model

# Connect to the index
index = pinecone.Index(index_name)

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare and store data
for source, text in [('Bible', bible_text), ('Buddhist', buddhist_text), ('Muslim', muslim_text)]:
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        # Create embedding
        embedding = model.encode(chunk).tolist()

        # Prepare metadata
        metadata = {
            'source': source,
            'chunk_id': i
        }

        # Upsert to Pinecone
        index.upsert(vectors=[(f'{source}_{i}', embedding, metadata)])

print('Sacred texts have been retrieved, chunked, and stored in the Pinecone database.')



Pinecone API key: 88884...7aa7d


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )

