In [24]:
import os
from dotenv import load_dotenv ,find_dotenv
load_dotenv(find_dotenv(), override=True)
# Show which API-key variables are visible (names only, never the values).
for env_name in os.environ:
    if "API_KEY" in env_name:
        print(env_name)

# Prompt for every required key that is still missing after the .env load.
# Testing `os.environ` for truthiness would never reach the prompts, because
# the process environment is practically never empty.
for required_key in ("GOOGLE_API_KEY", "PINECONE_API_KEY"):
    if not os.environ.get(required_key):
        import getpass
        os.environ[required_key] = getpass.getpass(required_key)

CLAUDE_API_KEY
GOOGLE_API_KEY
HUGGINGFACE_API_KEY
OPENAI_API_KEY
PINECONE_API_KEY


In [25]:
def load_docs(docs_urls=None):
    """Download the given pages with ``AsyncHtmlLoader``.

    Args:
        docs_urls: List of page URLs to fetch. When omitted, a fresh
            ``["https://pypi.org/"]`` list is used.

    Returns:
        The value returned by ``AsyncHtmlLoader(docs_urls).load()``; callers
        in this notebook read ``.page_content`` from each item.
    """
    from langchain.document_loaders.async_html import AsyncHtmlLoader

    # Build the default per call: a list literal in the signature is a single
    # object shared by every call, so any mutation would leak between calls.
    if docs_urls is None:
        docs_urls = ["https://pypi.org/"]

    print("loading started....")
    loader = AsyncHtmlLoader(docs_urls)
    return loader.load()

In [26]:
def clean_html(html_page:str, title:str):
    """Write the text content of an HTML page to ``files/<title>.txt``.

    Every text node found by BeautifulSoup is stripped of surrounding
    whitespace and written on its own line; nodes that are empty after
    stripping are skipped.

    Args:
        html_page: Raw HTML markup to parse.
        title: File name (without extension) to create inside ``files/``.

    Returns:
        None. The result is the file written to disk.
    """
    import os
    from bs4 import BeautifulSoup

    parser = BeautifulSoup(html_page, "html.parser")
    # Create the output folder on first use; open() alone would raise
    # FileNotFoundError when "files/" does not exist yet.
    os.makedirs("files", exist_ok=True)
    with open(f"files/{title}.txt", "w", encoding="utf-8") as f:
        for string in parser.strings:
            stripped = string.strip()
            # Comparing against a lone newline lets whitespace-only nodes
            # through as blank lines, so test the stripped text instead.
            if stripped:
                f.write(stripped)
                f.write("\n")

In [27]:
urls = [
    "https://fsciences.univ-setif.dz/main_page/english",
]

# Ordered substitutions that flatten a URL into a file title: separators become
# underscores first, then the scheme and a few domain suffixes are removed.
# NOTE(review): each fragment is removed wherever it occurs in the URL, not
# only in the host part (e.g. "com" inside a longer word) - confirm intended.
_TITLE_SUBSTITUTIONS = (
    ("/", "_"),
    (".", "_"),
    ("-", "_"),
    ("https:", ""),
    ("dz", ""),
    ("net", ""),
    ("com", ""),
    ("org", ""),
    ("edu", ""),
)

file_titles = []
html_pages = load_docs(urls)
for i, html_page in enumerate(html_pages):
    # Pages are paired with their source URL by position.
    cleaned_file_title = urls[i]
    for fragment, replacement in _TITLE_SUBSTITUTIONS:
        cleaned_file_title = cleaned_file_title.replace(fragment, replacement)
    cleaned_file_title = cleaned_file_title.strip("_")

    clean_html(html_page.page_content, cleaned_file_title)
    file_titles.append(cleaned_file_title)

loading started....


Fetching pages: 100%|##########| 1/1 [00:14<00:00, 14.69s/it]


In [28]:
from typing import List

def chunks_loader(
        text:str,  # could later accept pdfs, docs, ... as further sources
        chunk_size:int=200,
        chunk_overlap:int=20,
)->List:
    """Split a text into overlapping chunks.

    Args:
        text: The full text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (default 200).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (default 20).

    Returns:
        The list produced by
        ``RecursiveCharacterTextSplitter.create_documents([text])``.
        These are LangChain document objects rather than plain strings.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.create_documents([text])
    # The return must sit on its own line: when it trailed a comment on the
    # line above, the function silently returned None.
    return chunks

In [29]:
from pprint import pprint


# One entry per scraped page: the list of chunks produced from its text file.
chunks_list = []
for title in file_titles:
    source_path = f'files/{title}.txt'
    with open(source_path, mode="r", encoding="utf-8") as text_file:
        text = text_file.read()
    # The file is already closed here; chunking only needs the text.
    chunks = chunks_loader(text)
    chunks_list.append(chunks)
    pprint(len(chunks))

30


In [31]:
def insert_or_fetch_embeddings(chunks,index_name="site-web-faculte-science-info"):
    """Return a Pinecone vector store for ``index_name``.

    If the index already exists it is opened as-is and ``chunks`` are not
    inserted again. Otherwise the index is created and ``chunks`` are
    embedded and inserted into it.

    Args:
        chunks: Documents to embed when the index has to be created.
        index_name: Name of the Pinecone index to open or create.

    Returns:
        The vector store bound to ``index_name``.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

    pc = pinecone.Pinecone()
    embedding = GoogleGenerativeAIEmbeddings(model='models/text-embedding-004')
    if index_name in pc.list_indexes().names():
        print(f"index {index_name} already exists"  )
        vector_store = Pinecone.from_existing_index(index_name, embedding)
    else:
        print(f"creating the index {index_name}")
        pc.create_index(
            name=index_name,
            # presumably the vector size of the embedding model above -
            # confirm whenever the model is changed
            dimension=768,
            metric="cosine",
            spec=pinecone.PodSpec(environment="gcp-starter"),
        )
        print(f"Done Creation of {index_name}")
        # Insert only into a freshly created index. Running this for an
        # existing index too would discard the fetched store and upload the
        # same chunks again on every call.
        vector_store = Pinecone.from_documents(chunks, embedding, index_name=index_name)

    print(f"completed fetching the index {index_name}")
    return vector_store

In [32]:
def delete_indexes(index_name="all"):
    """Delete one Pinecone index, or every index when ``index_name`` is "all".

    Args:
        index_name: Name of the index to delete. The default value "all"
            deletes every index returned by the client's index listing.

    Returns:
        None. A confirmation message is printed.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Single-index case first, so the bulk delete reads as the fall-through.
    if index_name != "all":
        client.delete_index(index_name)
        print(f"{index_name} has been deleted")
        return

    for existing_name in client.list_indexes().names():
        client.delete_index(existing_name)
    print("all indexes has been deleted")


In [34]:
# Destructive: called with the default argument, this deletes every index the
# Pinecone client lists, not only the one this notebook creates.
delete_indexes()

all indexes has been deleted


In [35]:
# Only the chunks of the first scraped page are indexed; the returned store is
# the one the query loop searches.
vector_store = insert_or_fetch_embeddings(chunks_list[0])

# To index every scraped page instead:
# for chunks in chunks_list:
#     insert_or_fetch_embeddings(chunks)



creating the index site-web-faculte-science-info
Done Creation of site-web-faculte-science-info
completed fetching the index site-web-faculte-science-info


In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

gemini = ChatGoogleGenerativeAI(model="gemini-pro", temperature=1)

# The prompt and the chain do not depend on the query, so build them once
# instead of on every loop iteration.
enhanced_prompt = PromptTemplate.from_template(
    "based on this search results:\n {results} \ngive a summerizations"
)
chain = LLMChain(llm=gemini, prompt=enhanced_prompt, verbose=True)

# Interactive loop: type "quit" or "exit" to stop.
while True:
    query = input("give your query:")
    if query in ["quit", "exit"]:
        break
    if not query.strip():
        # Nothing to search for; ask again instead of embedding an empty query.
        continue

    top_result = vector_store.similarity_search(query)
    results = "\n".join(result.page_content for result in top_result)

    # Fill {results} with the joined page text. Passing `top_result` itself
    # would put the raw document objects into the prompt and leave the joined
    # text unused.
    response = chain.invoke({"results": results})
    print(response["text"])