In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import weaviate
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Credentials and endpoints are read from the environment rather than hardcoded.
api_key = os.environ.get("GEMINI_API_KEY")
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
weaviate_url = os.environ.get("WEAVIATE_URL")

# Shared chat model; create_rag_chain() below reads this module-level name.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=api_key)


In [2]:
import nest_asyncio

# Patch asyncio so nested event loops are allowed inside the notebook kernel.
# NOTE(review): presumably required by the async page fetching in the loaders below — confirm.
nest_asyncio.apply()

### WebBaseLoader

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Loader built from an explicit list of page URLs (only the home page is listed here).
loader_multiple_pages = WebBaseLoader(
    ["https://www.xevensolutions.com/"]
)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Fetch the listed pages; the cell output is the resulting list of Document objects.
# NOTE(review): the result is only displayed, not stored in a variable.
loader_multiple_pages.load()

[Document(metadata={'source': 'https://www.xevensolutions.com/', 'title': 'Xeven Solutions - AI Development & Solutions Company', 'description': 'Xeven Solutions is a leading AI Development & Solutions Company providing custom AI-based software services to automate workflow and boost innovation.', 'language': 'en-US'}, page_content="\n\n\n\n\n\n\n\nXeven Solutions - AI Development & Solutions Company\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\nServices\n\nAI Development Services AI Chatbot Development Predictive Modelling\u200b Mobile App Development Chat GPT Integrations Custom Software Natural Language Processing Machine Learning DevOps Computer Vision\u200b Custom Web Development Staff Augmentation UI UX Design\n\nSalesforce\nIndustries\n\nHealthTech EdTech FinTech GreenTech Internet of Things R

### RecursiveUrlLoader

In [42]:
from langchain_community.document_loaders import RecursiveUrlLoader

# Loader rooted at the site URL; every other option is left at the library default.
loader = RecursiveUrlLoader(
    "https://academy.dhruvrathee.com",
)

In [26]:
# Run the recursive loader.
# NOTE(review): `docs` is rebound further down to hold the split chunks.
docs = loader.load()

### SitemapLoader

In [40]:
from langchain_community.document_loaders.sitemap import SitemapLoader

# Sitemap URL uses a single slash before sitemap.xml; the previous ".com//sitemap.xml"
# contained a doubled slash, which not every server normalises.
sitemap_loader = SitemapLoader(web_path="https://www.xevensolutions.com/sitemap.xml")

In [39]:
# Load the pages referenced by the sitemap.
# NOTE(review): the saved progress output reads "0it", i.e. no pages were fetched on this run.
response = sitemap_loader.load()

Fetching pages: 0it [00:00, ?it/s]


In [25]:
# Print each page's text with every run of whitespace collapsed to one space.
for page in response:
    words = page.page_content.split()
    print(' '.join(words))

The Best Role of Natural Language Processing in Global Business Services AI Development Services AI Chatbot Development Predictive Modelling​ Mobile App Development Chat GPT Integrations Custom Software Natural Language Processing Machine Learning DevOps Computer Vision​ Custom Web Development Staff Augmentation UI UX Design Salesforce Industries HealthTech EdTech FinTech GreenTech Internet of Things Retail AI Diagnostics E-Commerce Smart Healthcare HIPAA Compliance Portfolio Company About Us Life at Xeven Resource Blogs Gallery Careers Contact Us X 971-56-813-6243 Free AI Consultation Services AI Development Services AI Chatbot Development Predictive Modelling​ Mobile App Development Chat GPT Integrations Custom Software Natural Language Processing Machine Learning DevOps Computer Vision​ Custom Web Development Staff Augmentation UI UX Design Salesforce Industries HealthTech EdTech FinTech GreenTech Internet of Things Retail AI Diagnostics E-Commerce Smart Healthcare HIPAA Compliance 

## Function for loading website content

In [24]:
import nest_asyncio

# Repeats the nest_asyncio patch applied near the top of the notebook, so this
# cell can be run on its own.
nest_asyncio.apply()


from langchain_community.document_loaders.sitemap import SitemapLoader

def load_website_content(website_url):
    """Load every page listed in a website's ``sitemap.xml``.

    Args:
        website_url: Base URL of the site, with or without trailing slashes.

    Returns:
        The list of documents returned by ``SitemapLoader.load()``. The list
        is empty when the sitemap yields no pages.
    """
    # Strip *all* trailing slashes so an input like "https://site//" cannot
    # produce a "//sitemap.xml" path.
    website_url = website_url.rstrip('/')
    sitemap_url = f"{website_url}/sitemap.xml"

    print(f"Attempting to load sitemap from: {sitemap_url}")
    documents = SitemapLoader(web_path=sitemap_url).load()

    if documents:
        print(f"Successfully loaded {len(documents)} pages from {website_url}")
    else:
        # An empty result should not be reported as a success.
        print(f"No pages were loaded from {sitemap_url}; the sitemap may be missing or empty")
    return documents

In [25]:
# NOTE(review): the saved output reports 0 pages for this site, so `website_content`
# is an empty list after this run.
website_content = load_website_content("https://academy.dhruvrathee.com/")

Attempting to load sitemap from: https://academy.dhruvrathee.com/sitemap.xml


Fetching pages: 0it [00:00, ?it/s]

Successfully loaded 0 pages from https://academy.dhruvrathee.com





In [None]:

# Show the whitespace-normalised text of every loaded page.
for page in website_content:
    normalised = ' '.join(page.page_content.split())
    print(normalised)
    

Devsinc | Leading Software & Product Development Agency In USA What we do CapabilitiesDigital TransformationWeb developmentApp DevelopmentCustom Software DevelopmentUX/UI DesignBusiness ApplicationsD365 ERPD365 CRMPower AppsEmerging TechnologiesMetaverseAugmented realityBlockchain & CryptographyData & AIGen AIData AnalyticsStaff AugmentationQuality AssuranceDevOpsCybersecuritySaaSE-commerceDesign & DevelopmentMaintenance & SupportAutomation & AppsGamingArt & DesignWeb3AR/VR/XRCloudCloud ApplicationCloud Ops & MigrationCloud maintenance & integrationWho we help IndustriesTravel & HospitalityPublic Sector TelecommunicationRetail & CPGOil, Gas, and EnergyStartupsE-commerceBanking & FintechHealthcare & PharmaceuticalsGamingWho We Are AboutLeadershipGeographiesAwards & RecognitionMedia & Investor RelationsESG ValuesCode of Conduct & ValuesHow we deliver BlogsThought LeadershipCase StudiesWhitepapersPlaybooksPerspectivePodcastClient TestimonialsJoin devsinc CareersCultureDiversity, Equity an

## Function for splitting web content

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_content(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents handed to ``split_documents``.
        chunk_size: Chunk size passed to the splitter (lengths measured with ``len``).
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [48]:
# Split the loaded pages into chunks. This rebinds `docs`, which an earlier cell
# used for the raw RecursiveUrlLoader result.
docs = split_content(website_content)

In [49]:
# Number of chunks produced by split_content().
len(docs)

1697

## Function for setting up the vector database

In [27]:
from langchain_weaviate.vectorstores import WeaviateVectorStore
from weaviate.classes.init import Auth



def setup_vector_database(docs, collection_name='mycollection'):
    """Embed documents and index them in a Weaviate Cloud collection.

    Reads the module-level ``api_key``, ``weaviate_url`` and
    ``weaviate_api_key`` values for credentials.

    Args:
        docs: Documents (typically the split chunks) to embed and store.
        collection_name: Name of the Weaviate collection to write into.
            NOTE(review): Weaviate is understood to capitalise the first
            letter of collection names — confirm how this appears in the cluster.

    Returns:
        The ``WeaviateVectorStore`` built from ``docs``. The Weaviate client
        is intentionally left open on success because the returned store
        keeps using it.

    Raises:
        Whatever the embedding or Weaviate calls raise; the client is closed
        before the exception propagates.
    """
    embeddings = GoogleGenerativeAIEmbeddings(
        google_api_key=api_key,
        model="models/embedding-001",
    )

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        skip_init_checks=True,
    )

    try:
        # WeaviateVectorStore takes the collection name as `index_name`. The
        # previous `collection_name=` keyword was not a recognised parameter,
        # so the store fell back to an auto-generated collection name.
        return WeaviateVectorStore.from_documents(
            docs,
            embeddings,
            client=client,
            index_name=collection_name,
        )
    except Exception:
        # Do not leak the connection when indexing fails.
        client.close()
        raise

In [None]:

from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


def create_rag_chain(vector_db, k=5):
    """Create a RAG (Retrieval-Augmented Generation) chain on top of a vector store.

    The chain retrieves the ``k`` most similar chunks for the question, fills
    them into the prompt as plain text, calls the module-level ``llm`` and
    returns the model's answer as a string.

    Args:
        vector_db: Vector store exposing ``as_retriever`` (e.g. the Weaviate store).
        k: Number of chunks to retrieve per question.

    Returns:
        A runnable chain; call ``.invoke(question)`` with the question string.
    """
    template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    
    Question: {question} 
    Context: {context} 
    
    Answer:"""

    def _format_docs(documents):
        # Pass only the page text to the prompt. Without this step the prompt
        # receives the repr of a list of Document objects (metadata, escaped
        # newlines and all), which wastes tokens and is harder for the model to read.
        return "\n\n".join(doc.page_content for doc in documents)

    prompt = ChatPromptTemplate.from_template(template)
    retriever = vector_db.as_retriever(search_kwargs={"k": k})

    rag_chain = (
        # The input question is fanned out: once to the retriever, once verbatim.
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain


In [None]:
# `vector_db` is not assigned anywhere earlier in the notebook, so on a fresh
# kernel this cell raised NameError. Build the store from the split chunks first.
vector_db = setup_vector_database(docs)
chain = create_rag_chain(vector_db)

In [62]:
# Ask the chain a question and print the model's answer.
result = chain.invoke("give me contact details of usman")
print(result)

Usman Asif is the Founder and CEO of Devsinc. You can reach him via LinkedIn:

https://www.linkedin.com/in/usman-asif-15038a22/


In [5]:
from langchain_community.document_loaders import SitemapLoader, WebBaseLoader, RecursiveUrlLoader
import time
from bs4 import BeautifulSoup



def scrape_website_content(website_url, timeout=30):
    """Load a website's pages, falling back to simpler strategies on failure.

    Strategies, in order:
      1. ``SitemapLoader`` on ``<website_url>/sitemap.xml``.
      2. ``RecursiveUrlLoader`` starting at ``website_url``, with script, style,
         nav and footer elements removed from each page.
      3. ``WebBaseLoader`` on ``website_url`` only.

    A strategy that raises has its error printed and the next one is tried.
    Strategies 1 and 2 are also skipped when they return no documents.

    Args:
        website_url: Base URL of the site; trailing slashes are ignored.
        timeout: Request timeout in seconds for the recursive and single-page
            loaders. It is not forwarded to the sitemap loader.

    Returns:
        The documents from the first strategy that yields any, otherwise the
        result of the final fallback (possibly empty), or ``[]`` if the final
        fallback raises.
    """
    # Strip every trailing slash so the sitemap URL never contains "//sitemap.xml".
    website_url = website_url.rstrip('/')

    def html_extractor(html_content):
        # Drop non-content elements before extracting the visible text.
        soup = BeautifulSoup(html_content, 'html.parser')
        for element in soup(["script", "style", "nav", "footer"]):
            element.extract()
        return soup.get_text(separator=' ', strip=True)

    # Strategy 1: sitemap.
    try:
        sitemap_loader = SitemapLoader(
            web_path=f"{website_url}/sitemap.xml",
            continue_on_failure=True,
            requests_per_second=2,
        )
        documents = sitemap_loader.load()
        if documents:
            return documents
    except Exception as e:
        print(f"Sitemap loading failed: {e}")

    # Strategy 2: recursive crawl from the base URL.
    try:
        print("Attempting recursive loading...")
        recursive_loader = RecursiveUrlLoader(
            url=website_url,
            extractor=html_extractor,
            # Previously the `timeout` argument was accepted but never used.
            timeout=timeout,
        )
        documents = recursive_loader.load()
        print(f"Successfully loaded {len(documents)} pages recursively")
        if documents:
            return documents
    except Exception as e:
        print(f"Recursive loading failed: {e}")

    # Strategy 3: the main page only.
    try:
        print("Attempting to load main page as final fallback...")
        loader = WebBaseLoader(
            website_url,
            continue_on_failure=True,
            requests_kwargs={"timeout": timeout},
        )
        documents = loader.load()
        print(f"Loaded {len(documents)} pages with basic loader")
        return documents
    except Exception as e:
        print(f"All loading methods failed. Final error: {e}")
        return []
   

In [6]:
import os
def save_website_content(website_content, output_dir='data'):
    """Write the whitespace-normalised text of all pages to a single file.

    Args:
        website_content: Iterable of objects with a ``page_content`` string.
        output_dir: Directory for the output file; created if it does not exist.

    Returns:
        Path of the written file, ``<output_dir>/website_content.txt``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collapse every run of whitespace inside a page to a single space.
    cleaned_pages = [' '.join(page.page_content.split()) for page in website_content]

    file_path = os.path.join(output_dir, "website_content.txt")
    with open(file_path, 'w', encoding='utf-8') as out_file:
        # Pages are separated by one blank line.
        out_file.write('\n\n'.join(cleaned_pages))
    return file_path

In [7]:
# Per the saved output, the sitemap attempt fetched nothing for this site and the
# recursive fallback returned 7 pages.
website_content = scrape_website_content("https://zabihullah.pythonanywhere.com/")

Fetching pages: 0it [00:00, ?it/s]


Attempting recursive loading...
Successfully loaded 7 pages recursively


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_content(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    NOTE(review): this repeats the ``split_content`` defined earlier in the
    notebook; the later definition shadows the earlier one.

    Args:
        documents: Documents handed to ``split_documents``.
        chunk_size: Chunk size passed to the splitter (lengths measured with ``len``).
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Chunk the scraped pages using the default chunk size and overlap.
splits = split_content(website_content)

In [15]:
# Number of chunks produced from the scraped pages.
len(splits)

7

In [None]:


# embeddings = GoogleGenerativeAIEmbeddings(
#         google_api_key=api_key, 
#         model="models/embedding-001"
#     )
# document_embeddings = embeddings.embed_documents([split.page_content for split in splits])

In [None]:
# from langchain_chroma import Chroma
# collection_name = "my_collection"
# vectorstore = Chroma.from_documents(collection_name=collection_name, documents=splits, embedding=embeddings, persist_directory="./chroma_db")
# db.persist()

In [None]:
# query = "tell me about zabih"
# retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# retriever.invoke(query)

[Document(id='f04245d1-cc20-4d75-88fb-3ce8917dd9d2', metadata={'content_type': 'text/html; charset=utf-8', 'language': 'en', 'source': 'https://zabihullah.pythonanywhere.com/about/', 'title': 'Zabih ullah - Portfolio'}, page_content="Zabih ullah - Portfolio About Me Hello! I'm Zabih ullah Ai/ML Intern @JMM Technologies based in Peshawar, Pakistan. AI and ML engineer with nearly one year of hands-on experience in developing intelligent applications. Successfully developed AI chatbots, predictive models, and web applications using advanced technologies like Langchain and Fastapi. Skilled in Python, data analysis, and deploying AI-driven solutions to enhance software capabilities. Email: Zabihullah18381@gmail.com Phone: 03190904793 Location: Peshawar, Pakistan Experience: 0+ Years Download CV Contact Me LinkedIn GitHub Twitter"),
 Document(id='895d5108-ad02-48db-9922-274b5b960cd5', metadata={'content_type': 'text/html; charset=utf-8', 'language': 'en', 'source': 'https://zabihullah.python

# For OCR