In [16]:
import json
from pathlib import Path
from typing import Union, List
import requests

from bs4 import BeautifulSoup
from weaviate.util import generate_uuid5
import weaviate

In [2]:
def get_links(filepath: str = "../data/links.txt") -> List[str]:
    """Read article URLs from a text file containing one URL per line.

    Args:
        filepath: Path of the links file. Defaults to ``../data/links.txt``.

    Returns:
        The whitespace-stripped, non-empty lines of the file, in file order.
    """
    # Explicit encoding keeps the result independent of the machine's locale.
    with open(filepath, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines (e.g. a trailing newline at the end of the file) so
        # callers never receive an empty string in place of a URL.
        return [link for link in stripped_lines if link]

In [3]:
def request_article(link: str, timeout: float = 30.0) -> str:
    """Download a page and return its body as text.

    Args:
        link: URL of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never mistaken for article HTML.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    return response.text

In [4]:
def write_html_to_file(filepath: str, html) -> None:
    """Write an HTML string to ``filepath``, replacing any existing file.

    Args:
        filepath: Destination path.
        html: The HTML text to store.
    """
    # Always write UTF-8: the platform default encoding may be unable to
    # represent characters that commonly occur in scraped pages.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html)

def read_html_from_file(filepath) -> str:
    """Read a stored HTML file and return its full contents as a string.

    Args:
        filepath: Path of the file to read (``str`` or ``pathlib.Path``).

    Returns:
        The file contents decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 rather than with the locale default, so the
    # same file reads identically on every machine.
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()

In [5]:
def parse_paragraphs(html) -> List[str]:
    """Extract the text of every ``<p>`` inside the page's first ``<article>``.

    Args:
        html: HTML markup of the page.

    Returns:
        The text of each paragraph in document order, or an empty list when
        the page has no ``<article>`` element.
    """
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(html, "html.parser")
    article = soup.article
    if article is None:
        # No <article> element; there is nothing to extract.
        return []
    return [p.text for p in article.find_all("p")]

def _parse_og_content(html, og_property: str) -> str:
    """Return the ``content`` attribute of the first ``<head>`` tag whose
    ``property`` attribute equals ``og_property``.

    Raises:
        IndexError: If the document has no ``<head>`` or no matching tag.
        KeyError: If the matching tag has no ``content`` attribute.
    """
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(html, "html.parser")
    head = soup.head
    matches = head.find_all(property=og_property) if head is not None else []
    if not matches:
        # Same exception type as indexing an empty result list, but with a
        # message that says which tag was missing.
        raise IndexError(f"no tag with property={og_property!r} found in <head>")
    return matches[0]["content"]


def parse_summary(html) -> str:
    """Return the page summary taken from its ``og:description`` tag.

    Raises:
        IndexError: If the page's ``<head>`` has no ``og:description`` tag.
    """
    return _parse_og_content(html, "og:description")


def parse_title(html) -> str:
    """Return the page title taken from its ``og:title`` tag.

    Raises:
        IndexError: If the page's ``<head>`` has no ``og:title`` tag.
    """
    return _parse_og_content(html, "og:title")


def parse_source(html) -> str:
    """Return the publishing site's name taken from its ``og:site_name`` tag.

    Raises:
        IndexError: If the page's ``<head>`` has no ``og:site_name`` tag.
    """
    return _parse_og_content(html, "og:site_name")

In [6]:
# Preview the URLs read from the default links file.
get_links()

['https://txt.cohere.ai/topic-modeling-trending-ai-papers/',
 'https://txt.cohere.ai/llm-parameters-best-outputs-language-ai/',
 'https://huyenchip.com/2022/08/03/stream-processing-for-data-scientists.html',
 'https://multithreaded.stitchfix.com/blog/2022/08/02/configuration-driven-ml-pipelines/']

In [7]:
# Download one article; the cells below use it as sample input for the parsers.
html = request_article("https://txt.cohere.ai/topic-modeling-trending-ai-papers/")

In [22]:
# Spot-check the og:site_name extraction on the sample page.
parse_source(html)

'Context by Cohere'

In [32]:
def create_article_obj(html: str) -> dict:
    """Assemble the article payload for one page.

    Args:
        html: HTML markup of the article page.

    Returns:
        A dict with the keys ``title``, ``summary`` and ``source``, each
        filled by the corresponding ``parse_*`` helper.
    """
    title = parse_title(html)
    summary = parse_summary(html)
    source = parse_source(html)
    return {"title": title, "summary": summary, "source": source}

In [33]:
# Build the article payload (title, summary, source) for the sample page.
article = create_article_obj(html)

In [34]:
def load_schema_classes(schema_path) -> dict:
    """Collect every ``*.json`` class definition found directly in a directory.

    Args:
        schema_path: Directory holding one JSON file per schema class
            (``str`` or ``pathlib.Path``). Sub-directories are not searched.

    Returns:
        ``{"classes": [...]}`` where the list holds the parsed content of each
        JSON file, ordered by file path.

    Raises:
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    schema_dir = Path(schema_path)

    # iterdir() yields entries in arbitrary order; sorting makes the resulting
    # class list reproducible. is_file() skips directories that merely happen
    # to be named "*.json".
    json_files = sorted(
        entry for entry in schema_dir.iterdir()
        if entry.is_file() and entry.suffix == ".json"
    )

    classes = []
    for json_file in json_files:
        with open(json_file, encoding="utf-8") as f:
            classes.append(json.load(f))

    return dict(classes=classes)

In [35]:
def load_article(client, html) -> None:
    """Load one article and its paragraphs into Weaviate in a single batch.

    Adds one ``Article`` object, one ``Paragraph`` object per parsed paragraph,
    and a pair of cross-references for each paragraph:
    ``Article.hasParagraphs -> Paragraph`` and ``Paragraph.fromArticle -> Article``.

    Args:
        client: Weaviate client whose ``batch`` context manager is used.
        html: HTML markup of the article page.
    """
    # Parse before opening the batch so a parsing failure happens before
    # anything has been queued.
    article_obj = create_article_obj(html)
    paragraphs = parse_paragraphs(html)
    article_uuid = generate_uuid5(article_obj, "Article")

    with client.batch() as batch:
        batch.add_data_object(data_object=article_obj, class_name="Article", uuid=article_uuid)

        for idx, paragraph in enumerate(paragraphs):
            paragraph_obj = dict(index=idx, content=paragraph)
            # NOTE(review): the UUID is derived from index + content only, so
            # identical paragraphs at the same position in two different
            # articles would presumably share a UUID. Confirm this is intended.
            paragraph_uuid = generate_uuid5(paragraph_obj, "Paragraph")
            batch.add_data_object(data_object=paragraph_obj, class_name="Paragraph", uuid=paragraph_uuid)

            # Positional arguments: (from uuid, from class, property, to uuid, to class).
            batch.add_reference(article_uuid, "Article", "hasParagraphs", paragraph_uuid, "Paragraph")
            # The back-reference must start at the paragraph; the previous code
            # passed article_uuid as the source of this Paragraph reference.
            batch.add_reference(paragraph_uuid, "Paragraph", "fromArticle", article_uuid, "Article")

In [36]:
# Connect to the Weaviate instance at http://localhost:8080.
client = weaviate.client.Client(url="http://localhost:8080")

# Reset the schema, then recreate it from the JSON class files in ../schema.
# NOTE(review): delete_all() is destructive and presumably removes the stored
# objects along with the classes; confirm before pointing this at an instance
# whose data matters.
client.schema.delete_all()
schemas = load_schema_classes("../schema")
client.schema.create(schemas)

In [37]:
# Load every saved article page (*.html) from the data directory into Weaviate.
data_path = Path("../data/articles")
for file in data_path.iterdir():
    # Skip anything that is not a saved HTML page.
    if not file.name.endswith(".html"):
        continue
    html = read_html_from_file(file)
    load_article(client, html)