In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# a later cell reads DB_URI from the environment via os.getenv.
load_dotenv()

True

# Web Page Indexing and Vectorization 👀

This Jupyter notebook contains a script that performs indexing and vectorization of web page contents. The primary purpose of this script is to crawl through a specified web page, extract the textual contents, and subsequently store these contents as vector objects in a database.

The vectorized information can then be utilized in a Retrieval-Augmented Generation (RAG) flow to answer questions using a Large Language Model (LLM). This process enables the creation of a more context-aware and responsive system, capable of providing detailed responses based on the indexed and vectorized information from the web page.

The notebook is structured in a step-by-step manner, guiding you through the process of web page crawling, text extraction, vectorization, and storage in a database. Each step is accompanied by detailed explanations and code snippets to provide a comprehensive understanding of the process.

## Web Crawler and Content Extractor

This code implements a web crawler and content extractor that:

1. Extracts URLs from the given HTML content, filtering for the same domain and validating the URLs. ✅
2. Crawls a website starting from a given URL, iteratively processing and extracting links from each page. ✅
3. Returns a list of HTML documents extracted from the website. ✅

The code displays the source URL of each processed page and the total number of pages in the extracted content.

In [2]:
from trafilatura.spider import focused_crawler
from trafilatura.settings import use_config

# Crawler settings are read from the local scraper_settings.cfg file.
newconfig = use_config("scraper_settings.cfg")

# Root URL of the site to crawl.
homepage = 'https://vectrix.ai'

# Seed pass: cap the crawl at a single seen URL so that only the homepage is
# fetched; the links found on it become the initial frontier.
to_visit, known_links = focused_crawler(
    homepage,
    config=newconfig,
    max_seen_urls=1,
)

# Report the pending frontier and everything discovered so far.
for heading, urls in (("TO VISIT \n", to_visit), ("KNOWN LINKS \n", known_links)):
    print(heading, urls, "\n")

TO VISIT 
 ['https://vectrix.ai/blog-post/your-ai-might-be-misleading-you-understanding-the-dual-nature-of-llm-outputs', 'https://vectrix.ai/contact-us', 'https://vectrix.ai/about-us', 'https://vectrix.ai/blog-post/understanding-large-and-small-language-models-key-differences-and-applications', 'https://vectrix.ai/platform', 'https://vectrix.ai/career', 'https://vectrix.ai/offerings', 'https://vectrix.ai/blog-post/google-deepminds-searchless-chess-engine---part-1', 'https://vectrix.ai/blog', 'https://vectrix.ai/blog-post/are-llm-benchmarks-and-leaderboards-just-marketing-tools'] 

KNOWN LINKS 
 ['https://vectrix.ai/', 'https://vectrix.ai/blog-post/your-ai-might-be-misleading-you-understanding-the-dual-nature-of-llm-outputs', 'https://vectrix.ai/contact-us', 'https://vectrix.ai/about-us', 'https://vectrix.ai/blog-post/understanding-large-and-small-language-models-key-differences-and-applications', 'https://vectrix.ai/platform', 'https://vectrix.ai/career', 'https://vectrix.ai/offerings'

In [3]:
# Resume the crawl from the frontier collected by the seed pass, with much
# larger limits on seen and known URLs.
# NOTE: this cell feeds its own outputs back in as inputs, so re-running it
# continues from the latest state rather than from the seed pass.
to_visit, known_links = focused_crawler(
    homepage,
    config=newconfig,
    todo=to_visit,
    known_links=known_links,
    max_seen_urls=1000,
    max_known_urls=100000,
)

# Report the remaining frontier and the full set of discovered links.
for heading, urls in (("TO VISIT \n", to_visit), ("KNOWN LINKS \n", known_links)):
    print(heading, urls, "\n")

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://vectrix.ai/robots.txt


TO VISIT 
 [] 

KNOWN LINKS 
 ['https://vectrix.ai/', 'https://vectrix.ai/blog-post/your-ai-might-be-misleading-you-understanding-the-dual-nature-of-llm-outputs', 'https://vectrix.ai/contact-us', 'https://vectrix.ai/about-us', 'https://vectrix.ai/blog-post/understanding-large-and-small-language-models-key-differences-and-applications', 'https://vectrix.ai/platform', 'https://vectrix.ai/career', 'https://vectrix.ai/offerings', 'https://vectrix.ai/blog-post/google-deepminds-searchless-chess-engine---part-1', 'https://vectrix.ai/blog', 'https://vectrix.ai/blog-post/are-llm-benchmarks-and-leaderboards-just-marketing-tools', 'https://vectrix.ai/blog-post/advanced-applications-and-future-trends-in-entity-analysis', 'https://vectrix.ai/job-list/open-application---create-your-own-dream-job', 'https://vectrix.ai/job-list/internship', 'https://vectrix.ai/job-list/junior-ai-researcher', 'https://vectrix.ai/job-list/software-engineer-front-end', 'https://vectrix.ai/offerings/projects', 'https://ve

In [4]:
from trafilatura.downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
from trafilatura import extract
from langchain_core.documents import Document
from tqdm.notebook import tqdm
import json, os
import hashlib
from vectrix.db import DB
# Empty accumulator list (not referenced by the other cells visible here).
results = list()

# Thread count handed to buffered_downloads in the processing loop.
threads = 10

# Document database handle; the connection URL comes from the DB_URI
# environment variable.
db = DB(db_url=os.environ.get('DB_URI'))

def extract_domain(url):
    """Return the network-location part (``host[:port]``) of ``url``.

    The value is whatever ``urllib.parse.urlparse`` reports as ``netloc``,
    so a string without a ``//`` authority marker yields an empty string.
    """
    from urllib.parse import urlparse

    parsed = urlparse(url)
    return parsed.netloc

# Derive the domain key once from the crawl root and use it for every DB call,
# so the lookup below and the inserts in the loop always refer to the same
# domain (a hard-coded literal would silently diverge if `homepage` changes).
domain_name = extract_domain(homepage)

# Only fetch URLs that are not yet stored for this domain.
# NOTE(review): assumes list_documents returns a collection of URL strings
# comparable with the entries of known_links; confirm against vectrix.db.
already_downloaded = db.list_documents(domain_name=domain_name)
to_download = [url for url in known_links if url not in already_downloaded]

# Add URLs to a compressed dictionary
url_store = add_to_compressed_dict(to_download)


# Processing loop: pull buffers of URLs from the store, download them with
# `threads` workers, extract each page as JSON and persist it.
with tqdm(total=len(to_download), desc="Processing URLs") as pbar:
    while not url_store.done:
        bufferlist, url_store = load_download_buffer(url_store, sleep_time=0)
        for url, result in buffered_downloads(bufferlist, threads):
            extracted_page = extract(result, output_format='json', include_links=True, with_metadata=True)

            # Store only pages for which extraction produced a result.
            if extracted_page:
                # The hash is taken over the serialized JSON extraction.
                page_hash = hashlib.sha256(extracted_page.encode()).hexdigest()
                db.add_document(
                    url=url,
                    page_hash=page_hash,
                    domain_name=domain_name,
                    storage_location="",
                    content=json.loads(extracted_page)
                )

            # Advance once per processed URL, whether or not it was stored,
            # so the bar can reach its total when some extractions fail.
            pbar.update(1)

# Remove documents again
# db.remove_documents(domain_name=domain_name)


INFO:root:Listing all pages for domain: vectrix.ai
[32m2024-08-12 17:06:10,122 - root - INFO - Listing all pages for domain: vectrix.ai[0m


Processing URLs: 0it [00:00, ?it/s]

## Data Preprocessing and Chunking
In this step we will split all the extracted web pages into logical chunks. 

➡️ We will use the [trafilatura](https://trafilatura.readthedocs.io/en/latest/) library to extract the main content of the web pages. It will return the main content of the page, the title, and the meta description.

➡️ We will pipe this to another splitter to further cut the sections into smaller chunks if they are too large. For this we use LangChain's text splitters.

➡️ We will also attach an LLM to the chain to ignore chunks that are not relevant, for example navigation bars, footers, etc.



### Chunking and metadata extraction
Using the functions below we extract the metadata and divide the text into chunks.

In [5]:
from vectrix.importers import chunk_content
# Fetch every stored page for the crawled domain. `domain_name` is the key
# computed in the download cell, so this lookup matches what was inserted
# there instead of relying on a separate hard-coded literal.
webpages = db.get_documents(domain_name=domain_name)

# Split the stored pages into chunks with the project's chunk_content helper.
chunked_webpages = chunk_content(webpages)

# Compare the number of items before and after chunking.
print(f"Before chunking we had {len(webpages)} and after chunking {len(chunked_webpages)}")

INFO:root:Getting all documents for domain: vectrix.ai
[32m2024-08-12 17:06:12,749 - root - INFO - Getting all documents for domain: vectrix.ai[0m


Before chunking we had 21 and after chunking 21


## Storing the result in Weaviate (cluster)

### Initialize the Vector store and check that all the required modules are installed

Download the Docker compose file if needed
```bash
curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?cohere_key_approval=yes&generative_anyscale=false&generative_aws=false&generative_cohere=false&generative_mistral=false&generative_octoai=false&generative_ollama=false&generative_openai=false&generative_palm=false&media_type=text&modules=modules&ner_module=false&qna_module=false&ref2vec_centroid=false&reranker_cohere=true&reranker_cohere_key_approval=yes&reranker_transformers=false&runtime=docker-compose&spellcheck_module=true&spellcheck_module_model=pyspellchecker-en&sum_module=false&text_module=text2vec-cohere&weaviate_version=v1.25.4&weaviate_volume=named-volume"
```

Make sure to set the persistent directory to the correct value:
```yaml
    volumes:
    - ~/weaviate_data:/var/lib/weaviate
```

Also configure the Cohere API key:
```yaml
environment:
      SPELLCHECK_INFERENCE_API: 'http://text-spellcheck:8080'
      COHERE_APIKEY: ***
```

In [7]:
from vectrix.db import Weaviate

# Instantiate the project's Weaviate wrapper with its default arguments
# (the requests logged below go to http://localhost:8080).
weaviate = Weaviate()

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"


[32m2024-08-12 17:07:41,337 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"[0m
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
[32m2024-08-12 17:07:41,356 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"[0m
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
[32m2024-08-12 17:07:41,414 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"[0m
I0000 00:00:1723475261.425448  160548 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [11]:
# Create the 'Vectrix' collection, configured to embed with the
# mxbai-embed-large:335m model served by Ollama at the given URL.
# NOTE(review): host.docker.internal presumably lets the Weaviate container
# reach an Ollama server on the host machine; confirm for your setup.
weaviate.create_collection(name='Vectrix', 
                           embedding_model='Ollama', 
                           model_name="mxbai-embed-large:335m",
                           model_url="http://host.docker.internal:11434")

INFO:httpx:HTTP Request: POST http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
[32m2024-08-12 17:08:02,928 - httpx - INFO - HTTP Request: POST http://localhost:8080/v1/schema "HTTP/1.1 200 OK"[0m


In [9]:
# Print the collections reported by the wrapper.
print(weaviate.list_collections())

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
[32m2024-08-12 17:07:54,706 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"[0m


['Vectrix', 'Elmos', 'Loop']


In [None]:
# Presumably selects 'Vectrix' as the active collection for later calls; verify against the wrapper.
# NOTE(review): the method name is spelled `set_colleciton` and this cell has
# no execution count, so confirm the spelling matches the Weaviate wrapper's API.
weaviate.set_colleciton(name='Vectrix')

In [12]:
# Hand the chunks produced by chunk_content to the wrapper for storage.
# NOTE(review): presumably they are inserted into the 'Vectrix' collection; confirm.
weaviate.add_data(chunked_webpages)

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema/Vectrix "HTTP/1.1 200 OK"
[32m2024-08-12 17:08:30,601 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/schema/Vectrix "HTTP/1.1 200 OK"[0m
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
[32m2024-08-12 17:08:30,603 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"[0m
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"
[32m2024-08-12 17:08:31,611 - httpx - INFO - HTTP Request: GET http://localhost:8080/v1/nodes "HTTP/1.1 200 OK"[0m


In [None]:
# Obtain a retriever from the wrapper and run one sample query through it.
# NOTE(review): this cell has no recorded execution or output; presumably it
# queries the collection populated by add_data. Confirm.
retriever = weaviate.get_retriever()
retriever.invoke('Who are the Vectrix founders ?')

In [10]:
# Cleanup: remove the 'Vectrix' collection (logged below as DELETE /v1/schema/Vectrix).
# Run this only when the indexed data should be discarded.
weaviate.remove_collection("Vectrix")

INFO:httpx:HTTP Request: DELETE http://localhost:8080/v1/schema/Vectrix "HTTP/1.1 200 OK"
[32m2024-08-12 17:07:59,740 - httpx - INFO - HTTP Request: DELETE http://localhost:8080/v1/schema/Vectrix "HTTP/1.1 200 OK"[0m


In [None]:
# Call the wrapper's info() method (no output is recorded for this cell).
weaviate.info()

In [None]:
# Close the wrapper when finished; presumably this releases the client connection. Confirm.
weaviate.close()