In [None]:
# Sample FAQ page passed to the example calls that follow each helper definition.
URL = 'https://faq.bsv.admin.ch/de/familienzulagen/wann-gilt-ein-jugendlicher-als-ausbildung'


In [None]:
''' Extract parsed HTML content from a web page. '''
import requests
from bs4 import BeautifulSoup

def fetch_and_parse_html(url, tag):
    """
    Fetches the HTML content from a given URL and returns a BeautifulSoup object
    for parsing the HTML.

    :param url: The URL of the web page to fetch.
    :param tag: Currently unused; kept so that existing two-argument calls keep
                working. NOTE(review): possibly intended to select a sub-tree
                (e.g. ``soup.find(tag)``) — confirm before changing the return value.
    :return: A BeautifulSoup object representing the parsed HTML content, or
             None if the request fails or the response status is not 200.
    """
    # Handle network failures the same way as the other fetch helpers in this
    # file: report the problem and return the "nothing found" value.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

    if response.status_code != 200:
        return None

    # Force UTF-8 decoding before reading ``response.text``.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, features="html.parser")

# Fetch and parse the sample page. fetch_and_parse_html returns the parsed
# document (or None); it does not narrow the result to the given tag name.
body = fetch_and_parse_html(URL, 'body')
# Bare expression as the last statement — presumably so the notebook cell displays the value.
body


In [None]:
''' Extract language html tag value from a web page. '''
import requests
from bs4 import BeautifulSoup

def extract_language(url):
    """
    Reads the ``lang`` attribute of the ``<html>`` element of a web page.

    :param url: Address of the page to inspect.
    :return: The value of the ``lang`` attribute (e.g., 'en', 'de'), or None when
             the request fails, the status is not 200, there is no ``<html>``
             element, or the attribute is missing or empty.
    """
    try:
        page = requests.get(url, timeout=10)
        if page.status_code != 200:
            return None

        root = BeautifulSoup(page.text, features="html.parser").find('html')
        if not root:
            return None

        # An empty attribute value is treated the same as a missing one.
        return root.get('lang') or None
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

# Example usage: read the language declared on the sample page and print it
# (prints None when no language could be determined).
language = extract_language(URL)
print(language)


In [None]:
''' Extract category from a web page. '''
import urllib.parse

def extract_category(url, category_position_in_path=2):
    """
    Extracts a category segment from the URL of a webpage.

    The category is determined by splitting the URL's path on ``/`` and selecting
    a segment based on its 1-based position. Empty segments (leading, trailing or
    doubled slashes) are ignored, so ``/de//foo/`` is treated like ``/de/foo``.

    :param url: The URL to extract the category from.
    :param category_position_in_path: The 1-based position of the segment in the
                                      URL path that is considered the category.
                                      Default is 2.
    :return: The extracted category segment if available, otherwise None. None is
             also returned when the position is smaller than 1 or the URL cannot
             be processed.
    """
    try:
        # Positions are 1-based. Without this guard, 0 or a negative value would
        # index from the end of the list and return an unrelated segment.
        if category_position_in_path < 1:
            return None

        parsed_url = urllib.parse.urlparse(url)
        path_segments = [segment for segment in parsed_url.path.split('/') if segment]
    except (ValueError, TypeError, AttributeError) as e:
        # urlparse raises ValueError for malformed URLs; non-string inputs
        # surface as TypeError/AttributeError.
        print(f"Error processing the URL: {e}")
        return None

    if len(path_segments) < category_position_in_path:
        return None
    return path_segments[category_position_in_path - 1]

# Take the category from the sample URL's path using the default segment position.
category = extract_category(URL)
# Bare expression as the last statement — presumably so the notebook cell displays the value.
category


In [None]:
''' Extract QA pair from a web page. '''
import re
import requests
from bs4 import BeautifulSoup

def extract_and_clean_text(url, tags, remove_patterns=None):
    """
    Downloads a web page, collects the text of the requested HTML tags and
    removes unwanted patterns from the result.

    ``<header>`` and ``<footer>`` elements are dropped before any text is
    collected. Within each matching element, lines are stripped and blank lines
    discarded; the text of separate elements is joined by an empty line.

    :param url: The URL of the webpage to extract text from.
    :param tags: A single tag name or a list of tag names to extract text from.
    :param remove_patterns: Optional list of strings or regex patterns; every
                            match is deleted from the combined text.
    :return: The extracted and cleaned text, or an empty string when the request
             fails or the response status is not 200.
    """
    try:
        page = requests.get(url, timeout=10)
        if page.status_code != 200:
            return ''

        page.encoding = 'utf-8'
        document = BeautifulSoup(page.text, features="html.parser")

        # Remove page chrome so it cannot leak into the extracted text.
        for boilerplate in document.find_all(['header', 'footer']):
            boilerplate.decompose()

        wanted_tags = [tags] if isinstance(tags, str) else tags

        blocks = []
        for tag_name in wanted_tags:
            for node in document.find_all(tag_name):
                stripped_rows = [row.strip() for row in node.get_text().splitlines()]
                blocks.append('\n'.join(row for row in stripped_rows if row))

        result = '\n\n'.join(blocks)
        for pattern in remove_patterns or ():
            result = re.sub(pattern, '', result)
        return result
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ''

# The question is the text of the page's <h1> element(s).
question = extract_and_clean_text(URL, ['h1'])

# Strings stripped from the answer text — these look like the "Answer" heading
# in German, Italian and French (TODO confirm against the live pages).
remove_list = ['Antwort\n', 'Rispondi\n', 'Réponse\n']
# The answer is the text of the page's <article> element(s).
answer = extract_and_clean_text(URL, ['article'], remove_list)

print(f"question: {question}")
print(f"answer: {answer}")


In [None]:
''' Iterate over the URLs listed in a sitemap.xml, extract the QA pair
    from each web page, and save it to a SQLite database.

    @todo: Add logic to update existing records.
'''
import requests
import xml.etree.ElementTree as ET
import sqlite3
from datetime import datetime

def get_current_timestamp():
    """ Return the current date and time as a 'YYYY-MM-DD HH:MM:SS' string. """
    moment = datetime.now()
    return format(moment, "%Y-%m-%d %H:%M:%S")

def get_sitemap_urls(sitemap_url):
    """
    Extract the page URLs listed in a sitemap.

    Each direct child of the sitemap's root element is inspected for a ``<loc>``
    element in the sitemaps.org namespace; children without a usable ``<loc>``
    are skipped.

    :param sitemap_url: The URL of the sitemap XML document.
    :return: A list of URL strings. The list is empty when the request fails,
             the response status is not 200, or the document is not valid XML.
    """
    try:
        # Same timeout as the other requests in this file, so a stalled server
        # cannot hang the whole run.
        response = requests.get(sitemap_url, timeout=10)
    except requests.RequestException as e:
        print(f"Fehler beim Abrufen der Sitemap: {e}")
        return []

    if response.status_code != 200:
        return []

    try:
        # Parse the raw bytes so the XML parser honours the encoding declared in
        # the document instead of the encoding guessed from the HTTP headers.
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Fehler beim Parsen der Sitemap: {e}")
        return []

    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    urls = []
    for entry in root:
        loc = entry.find("sitemap:loc", namespace)
        # Skip entries with a missing or empty <loc> instead of failing on
        # ``None.text``.
        if loc is None or not loc.text:
            continue
        location = loc.text.strip()
        if location:
            urls.append(location)
    return urls

def save_data_to_db(db_connection, data):
    """
    Insert one FAQ record into the ``faq_data`` table.

    The insert runs inside the connection's transaction context: it is committed
    on success and rolled back on failure, so a failed insert does not leave an
    open transaction behind. Note that the rollback also discards any other
    uncommitted changes pending on the same connection.

    :param db_connection: An open ``sqlite3`` connection whose database contains
                          the ``faq_data`` table.
    :param data: A 7-item sequence in the order (language, category, question,
                 answer, source, created_at, updated_at).
    :return: None. Database errors are reported on stdout and not re-raised.
    """
    try:
        # ``with connection`` commits on success and rolls back on an exception.
        with db_connection:
            db_connection.execute(
                """INSERT INTO faq_data (language, category, question, answer, source, created_at, updated_at)
                          VALUES (?, ?, ?, ?, ?, ?, ?)""",
                data,
            )
    except sqlite3.DatabaseError as e:
        print(f"Fehler beim Speichern in die Datenbank: {e}")

# Initialise the SQLite database; the table is created on the first run only.
conn = sqlite3.connect('/workspaces/b3rn_zero_copilot/vectorstors-container/vectorstors/data/bsv_faq.db')
try:
    conn.execute('''CREATE TABLE IF NOT EXISTS faq_data
                 (id INTEGER PRIMARY KEY, language TEXT, category TEXT, question TEXT, answer TEXT, source TEXT, created_at TEXT, updated_at TEXT)''')
    conn.commit()

    # Strings removed from the extracted text before it is stored.
    remove_patterns = ['Antwort\n', 'Rispondi\n', 'Réponse\n']
    sitemap_url = 'https://faq.bsv.admin.ch/sitemap.xml'
    urls = get_sitemap_urls(sitemap_url)

    for url in urls:
        # One failing page must not abort the whole import, hence the broad
        # per-URL handler that logs and moves on.
        try:
            # NOTE: each helper below performs its own HTTP request, so every
            # page is downloaded three times per iteration.
            extracted_language = extract_language(url)
            extracted_category = extract_category(url)
            extracted_h1 = extract_and_clean_text(url, 'h1', remove_patterns)
            extracted_article = extract_and_clean_text(url, 'article', remove_patterns)

            # Store only pages that have a question and a supported language.
            if extracted_h1 and extracted_language in ['de', 'it', 'fr', 'en']:
                timestamp = get_current_timestamp()
                data = (extracted_language, extracted_category, extracted_h1, extracted_article, url, timestamp, timestamp)
                save_data_to_db(conn, data)
        except Exception as e:
            print(f"Fehler bei der Verarbeitung der URL {url}: {e}")
finally:
    # Close the connection even if table creation or sitemap retrieval raises.
    conn.close()
print("Fertig! Alle Seiten wurden verarbeitet und in die SQLite-Datenbank gespeichert.")
