In [27]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URL of the page to fetch
url = 'https://docs.e2enetworks.com'

# Get the HTML content of the page
response = requests.get(url)
html_content = response.text

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all the anchor tags in the HTML
anchor_tags = soup.find_all('a', href=True)

# Initialize a set to store unique working hrefs
unique_links = set()

# Check each anchor tag for a valid href attribute
for tag in anchor_tags:
    # Ensure the href attribute is an absolute URL
    link = urljoin(url, tag['href'])
    # Add the link to the set (duplicates are automatically avoided)
    unique_links.add(link)

# Open a file to write the working hrefs
with open('working_links.txt', 'w') as file:
    for link in unique_links:
        try:
            # Send a request to the link
            link_response = requests.head(link, allow_redirects=True, timeout=5)
            # Check if the link is working
            if link_response.status_code == 200:
                print(f"Writing {link} in the text file")
                file.write(link + '\n')
        except requests.RequestException as e:
            # If there was a problem with the request (e.g. timeout, DNS failure, etc.)
            print(f"Could not retrieve {link}: {e}")

Writing https://www.e2enetworks.com/product/windows-on-e2e-cloud in the text file
Writing https://docs.e2enetworks.com/computes/nodes/index.html in the text file
Writing https://docs.e2enetworks.com/kubernetes/agro_cd.html#deploy-to-kubernetes-using-argo-cd-and-gitops in the text file
Writing https://www.e2enetworks.com/team in the text file
Writing https://docs.e2enetworks.com/kubernetes/troubleshooting.html in the text file
Writing https://www.e2enetworks.com/products/ in the text file
Writing https://docs.e2enetworks.com/faas_doc/faas.html#how-to-create-functions in the text file
Writing https://docs.e2enetworks.com/AI_ML/index.html in the text file
Writing https://docs.e2enetworks.com/AI_ML/introduction.html in the text file
Writing https://docs.e2enetworks.com/abuse/phishing.html in the text file
Writing https://docs.e2enetworks.com#getting-started in the text file
Writing https://docs.e2enetworks.com/containerregistry/index.html in the text file
Writing https://docs.e2enetworks.c

In [28]:
unique_links

{'https://docs.e2enetworks.com',
 'https://docs.e2enetworks.com#content',
 'https://docs.e2enetworks.com#e2e-networks-documentation',
 'https://docs.e2enetworks.com#getting-started',
 'https://docs.e2enetworks.com#id1',
 'https://docs.e2enetworks.com#our-ai-first-infrastructure',
 'https://docs.e2enetworks.com/',
 'https://docs.e2enetworks.com/AI_ML/Billing.html',
 'https://docs.e2enetworks.com/AI_ML/analytics.html',
 'https://docs.e2enetworks.com/AI_ML/api_tokens.html',
 'https://docs.e2enetworks.com/AI_ML/committed_notebook.html',
 'https://docs.e2enetworks.com/AI_ML/container.html',
 'https://docs.e2enetworks.com/AI_ML/dataset.html',
 'https://docs.e2enetworks.com/AI_ML/fine_tuning_docs.html',
 'https://docs.e2enetworks.com/AI_ML/getting_started.html',
 'https://docs.e2enetworks.com/AI_ML/gpu_notebook_h100.html',
 'https://docs.e2enetworks.com/AI_ML/howdoi/index.html',
 'https://docs.e2enetworks.com/AI_ML/index.html',
 'https://docs.e2enetworks.com/AI_ML/introduction.html',
 'https:

In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Read URLs from a file and store them in a list
with open('links.txt', 'r') as file:
    urls_list = [line.strip() for line in file if line.strip()]

print(urls_list)

['https://docs.e2enetworks.com#content', 'https://docs.e2enetworks.com', 'https://docs.e2enetworks.com', 'https://www.e2enetworks.com', 'https://myaccount.e2enetworks.com', 'https://myaccount.e2enetworks.com', 'https://docs.e2enetworks.com', 'https://docs.e2enetworks.com', 'https://www.e2enetworks.com', 'https://docs.e2enetworks.com/myaccountnew/index.html', 'https://docs.e2enetworks.com/myaccountnew/Indian_customer_Signup.html', 'https://docs.e2enetworks.com/myaccountnew/Customer_validation_indian.html', 'https://docs.e2enetworks.com/myaccountnew/International_customer_Signup.html', 'https://docs.e2enetworks.com/myaccountnew/Customer_validation1.html', 'https://docs.e2enetworks.com/myaccountnew/customer_validation.html', 'https://docs.e2enetworks.com/myaccountnew/customer_validation_faq.html', 'https://docs.e2enetworks.com/myaccountnew/myaccount_faq.html', 'https://docs.e2enetworks.com/myaccountnew/sign_in.html', 'https://docs.e2enetworks.com/release.html', 'https://docs.e2enetworks.c

In [8]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(urls_list)
docs = loader.load_and_split()

In [9]:
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
pipeline
)

import torch

import streamlit as st

import os

from langchain.llms import HuggingFacePipeline
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate, format_document
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain.memory import ConversationBufferMemory
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string

from operator import itemgetter

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def load_llm():

    #Loading the Mistral Model
    model_name='mistralai/Mistral-7B-Instruct-v0.2'
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )


    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    )

    # Building a LLM text-generation pipeline
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.2,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=1024,
    )

    llm = HuggingFacePipeline(pipeline= text_generation_pipeline)

    return llm

In [11]:
def embeddings_model():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return embeddings

In [33]:
def initialize_vectorstore():

    # Read URLs from the links.txt file and store them in a list
    with open('links.txt', 'r') as file:
        urls_list = [line.strip() for line in file if line.strip()]

    #Initializing a text_splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    #Loading all the content of the urls in docs format
    loader = WebBaseLoader(urls_list)
    docs = loader.load_and_split(text_splitter=text_splitter)

    vectorstore = FAISS.from_documents(
        docs, embedding=hf_embeddings
    )
    retriever = vectorstore.as_retriever(search_kwargs = {'k':10})
    return retriever

In [13]:
def return_chain_elements():

    #template to get the Standalone question
    _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:
    """
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

    #Function to create the context from retrieved documents
    DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
    def _combine_documents(
        docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
    ):
        doc_strings = [format_document(doc, document_prompt) for doc in docs]
        return document_separator.join(doc_strings)

    #Creating the template for the final answer
    template = """Answer the question based only on the following context:
        {context}

        Question: {question}
    """
    ANSWER_PROMPT = ChatPromptTemplate.from_template(template)


    # Now we calculate the standalone question
    standalone_question = {
        "standalone_question": {
            "question": lambda x: x["question"],
            "chat_history": lambda x: get_buffer_string(x["chat_history"]),
        }
        | CONDENSE_QUESTION_PROMPT
        | llm
        | StrOutputParser(),
    }

    # Now we retrieve the documents
    retrieved_documents = {
        "docs": itemgetter("standalone_question") | vector_retriever,
        "question": lambda x: x["standalone_question"],
    }

    # Now we construct the inputs for the final prompt
    final_inputs = {
        "context": lambda x: _combine_documents(x["docs"]),
        "question": itemgetter("question"),
    }

    # And finally, we do the part that returns the answers
    answer = {
        "answer": final_inputs | ANSWER_PROMPT | llm,
        "docs": itemgetter("docs"),
    }

    return standalone_question, retrieved_documents, answer


In [38]:
llm = load_llm()

hf_embeddings = embeddings_model()

vector_retriever = initialize_vectorstore()

standalone_question, retrieved_documents, answer = return_chain_elements()

In [15]:
conversational_memory = ConversationBufferMemory(
        return_messages=True, output_key="answer", input_key="question"
    )

In [17]:
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(conversational_memory.load_memory_variables) | itemgetter("history"),
)

In [18]:
chain = loaded_memory | standalone_question | retrieved_documents | answer

In [19]:
inputs = {"question": "How do I start a GPU node?"}

In [35]:
result = chain.invoke(inputs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [21]:
print(result)

{'answer': '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

In [39]:
result["docs"]

[Document(page_content='How to Launch GPU H100 notebook | E2E Networks  documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to content\n    \n\n\n\nE2E Cloud\n\nDocs\nE2E Networks\n\n\n\n\nToggle navigation menu\n\n\n\n\n\n              Login\n            \n\n\n\n              Sign Up\n            \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nE2E Networks  documentation\n\n\n\n\n\n⌘\n    K\n  \n\n\n\nDocs\nE2E Networks\n\nMyaccount\n\n        Getting Started with Myaccount\nSignUp Process for Indian Customers\nCustomer validation Process for Indian Customers\nSignUp Process for International Customers\nCustomer Validation Process for International Process\nCustomer Validation Process for Contact Persons\nDomestic Customer Validation Process FAQs\nInternational Customer Validation Process FAQs\nSign In Process\n\n\n   Release Notes\n\nCompute\n\nNodes\n Virtual Compute Nodes\n Monitoring\n 1-Click Deployment\n Active Directory\n\n\nGPU\nGPU Cloud\n\n\nEQS\nIntroduction\nHow to Create 

In [43]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]

    return document_separator.join(doc_strings)

In [45]:
y =_combine_documents(docs[1:4])

['\n\n\n\n\n\n\n\n\n\nE2E Networks Documentation | E2E Networks  documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to content\n    \n\n\n\nE2E Cloud\n\nDocs\nE2E Networks\n\n\n\n\nToggle navigation menu\n\n\n\n\n\n              Login\n            \n\n\n\n              Sign Up\n            \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nE2E Networks  documentation\n\n\n\n\n\n⌘\n    K\n  \n\n\n\nDocs\nE2E Networks\n\nMyaccount\n\n        Getting Started with Myaccount\nSignUp Process for Indian Customers\nCustomer validation Process for Indian Customers\nSignUp Process for International Customers\nCustomer Validation Process for International Process\nCustomer Validation Process for Contact Persons\nDomestic Customer Validation Process FAQs\nInternational Customer Validation Process FAQs\nSign In Process\n\n\n   Release Notes\n\nCompute\n\nNodes\n Virtual Compute Nodes\n Monitoring\n 1-Click Deployment\n Active Directory\n\n\nGPU\nGPU Cloud\n\n\nEQS\nIntroduction\nHow to Create EQS ?\nAc