In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from bs4 import BeautifulSoup as Soup
import os

# Define paths
url = "https://mydukaan.io/"
chroma_db = 'vectorstore/db_chroma'

def process_url_content(content):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    # Check if the content is a tuple (content, metadata)
    if isinstance(content, tuple) and len(content) >= 1:
        content = content[0]  # Extract the content from the tuple
    
    # print(content)
    texts = text_splitter.split_documents([content])  # Wrap content in a list for processing
    print(texts)
    
    # Initialize HuggingFaceEmbeddings using a specific model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
    
    # Create a vector store using FAISS from the text content and embeddings
    db = Chroma.from_documents(texts,embeddings, persist_directory=chroma_db)
    
    # Save the vector store locally
    # db.save_local(os.path.join(chroma_db, f"url_content_db"))
    # print(f"Vector store saved for content from URL")

def process_urls_sequentially():
    # Ensure the directory exists
    if not os.path.exists(chroma_db):
        os.makedirs(chroma_db)
        # print(f"Created directory: {chroma_db}")

    # Load content from URL using RecursiveUrlLoader
    loader = RecursiveUrlLoader(
        url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text,
    )
    contents = loader.load()
    
    # print(f"URL content loaded from {url}")
    
    for content in contents:
        process_url_content(content)
        # print(f"Processing content from URL sequentially")

if __name__ == "__main__":
    process_urls_sequentially()


[Document(page_content="Dukaan - Enterprise Ecommerce Platform for BusinessesProductsDukaan themesDiscover themes from our curated collection & start with the one perfect for your business.Dukaan deliveryYour pan-India hassle-free shipping partner.Dukaan pluginsAdd extra functionality, features, and customization with the help of plugins.Business toolsFree tools to help take your business to the next level.CompanyCareersJoin the team and be a part of the rocketship.AboutThe who, what, and why of Dukaan.ResourcesBlogGet useful tips on how to start & grow your online business.CommunityBecome a part of our exclusive Dukaan VIP Facebook group with over 50k+ members.VideosAcquire skills to setup and run your online store from our videos and tutorials.Help centerAdvice and answers from the Dukaan Team.PricingSign inStart freeYour Global Commerce Partner, Engineered for Peak PerformanceLaunch your eye-catching online store with ease, attract and convert more customers than ever before.Get sta

Created a chunk of size 2324, which is longer than the specified 1000
Created a chunk of size 1199, which is longer than the specified 1000


[Document(page_content='The Dukaan® Blog - Start, Run and Grow Your Online Business.', metadata={'source': 'https://mydukaan.io/blog', 'title': 'The Dukaan® Blog - Start, Run and Grow Your Online Business.', 'description': 'Start, Run and Grow Your Online Business.', 'language': 'en-US'}), Document(page_content="Skip to main content Skip to primary sidebarAdditional menuThe Dukaan® BlogStart, Run and Grow Your Online Business.Updates\nBusiness Ideas\nSell Online\nSales & Marketing\nBusiness Tools\nComparison\nFull List of Features on Dukaan & Product Updates\n Last updated on: February 19, 2023byJyotbir LambaIf you've been wondering what are the latest feature updates on Dukaan, here's all you need to …Continue Reading about Full List of Features on Dukaan & Product Updates →Dukaan Dimensions 2022 – A Retrospective on the Growth Enabled by Dukaan\n Published: December 30, 2022  Team DukaanIt's been a whirlwind of a year for Dukaan, and we can't believe it's almost over! As we look back

Created a chunk of size 2324, which is longer than the specified 1000
Created a chunk of size 1199, which is longer than the specified 1000


[Document(page_content='The Dukaan® Blog - Start, Run and Grow Your Online Business.', metadata={'source': 'https://mydukaan.io/blog/', 'title': 'The Dukaan® Blog - Start, Run and Grow Your Online Business.', 'description': 'Start, Run and Grow Your Online Business.', 'language': 'en-US'}), Document(page_content="Skip to main content Skip to primary sidebarAdditional menuThe Dukaan® BlogStart, Run and Grow Your Online Business.Updates\nBusiness Ideas\nSell Online\nSales & Marketing\nBusiness Tools\nComparison\nFull List of Features on Dukaan & Product Updates\n Last updated on: February 19, 2023byJyotbir LambaIf you've been wondering what are the latest feature updates on Dukaan, here's all you need to …Continue Reading about Full List of Features on Dukaan & Product Updates →Dukaan Dimensions 2022 – A Retrospective on the Growth Enabled by Dukaan\n Published: December 30, 2022  Team DukaanIt's been a whirlwind of a year for Dukaan, and we can't believe it's almost over! As we look bac

In [16]:
# load from disk
# create the open-source embedding function
query="what is dukan"
embedding_function = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
db3 = Chroma(persist_directory="vectorstore\db_chroma", embedding_function=embedding_function)
docs = db3.similarity_search(query)
print(docs[0].page_content)

page_content="Dukaan Branding - Logo Downloads & Brand GuidelinesProductsDukaan themesDiscover themes from our curated collection & start with the one perfect for your business.Dukaan deliveryYour pan-India hassle-free shipping partner.Dukaan pluginsAdd extra functionality, features, and customization with the help of plugins.Business toolsFree tools to help take your business to the next level.CompanyCareersJoin the team and be a part of the rocketship.AboutThe who, what, and why of Dukaan.ResourcesBlogGet useful tips on how to start & grow your online business.CommunityBecome a part of our exclusive Dukaan VIP Facebook group with over 50k+ members.VideosAcquire skills to setup and run your online store from our videos and tutorials.Help centerAdvice and answers from the Dukaan Team.PricingSign inStart freeLogo Downloads & Brand GuidelinesDownload vector and .png versions of our logo. We ask you to respect our branding guidelines and not alter the logo in any way, shape or form. Prima