In [16]:
def make_url(offset):
    """Build the URL of one page of the Lovdata decision register.

    Args:
        offset: Value placed in the ``offset`` query parameter, i.e. the
            position in the listing at which the requested page starts.

    Returns:
        str: The listing URL (``verdict=TRA`` filter, ``#doclistheader`` fragment).
    """
    listing_url = f"https://lovdata.no/register/avgjørelser?verdict=TRA&offset={offset}#doclistheader"
    return listing_url

In [17]:
from bs4 import BeautifulSoup as bs
import requests

In [70]:
# gather 100 documents:

def get_documents(start_idx, end_idx):
    """Collect the document links listed between two positions of the register.

    The register is fetched page by page, 20 entries per page (the page size
    this notebook has always assumed). Whole pages are fetched, so the result
    covers every page that overlaps ``[start_idx, end_idx)``; a range that
    does not start or end on a multiple of 20 yields the surrounding full pages.

    Args:
        start_idx: Position of the first listing entry wanted.
        end_idx: Position one past the last listing entry wanted.

    Returns:
        list[str]: ``href`` of every ``<a>`` whose link contains
        ``/dokument/``, in the order the pages were fetched.

    Raises:
        ValueError: If ``end_idx`` is smaller than ``start_idx``.
        requests.HTTPError: If a listing page answers with an error status.
    """
    if end_idx < start_idx:
        raise ValueError("end_idx must be greater than or equal to start_idx")

    page_size = 20
    # Index of the first page to fetch (not a raw offset, despite the name
    # used in the progress message below).
    start_offset = start_idx // page_size
    if end_idx == start_idx:
        num_iter = 0
    else:
        # Last page is the one containing entry end_idx - 1, so a partially
        # covered final page is fetched instead of being silently dropped.
        num_iter = (end_idx - 1) // page_size - start_offset + 1
    print(f"Iterating {num_iter} times, starting at offset {start_offset}")
    documents = []

    for page in range(start_offset, start_offset + num_iter):
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(make_url(page * page_size), timeout=30)
        print(res.url)
        # Fail loudly rather than parsing an error page as "0 documents".
        res.raise_for_status()
        soup = bs(res.text, 'html.parser')
        links = soup.find_all('a', href=lambda href: href and '/dokument/' in href)
        # only the href is needed downstream
        hrefs = [link['href'] for link in links]
        print(f"found {len(hrefs)} documents")
        documents.extend(hrefs)
    return documents

# Collect the links for listing positions 100-199 (five result pages of 20).
documents = get_documents(100, 200)

Iterating 5 times, starting at offset 5
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=100#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=120#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=140#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=160#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=180#doclistheader
found 20 documents


In [51]:
import unicodedata
import re

def make_case_url(id):
    """Turn a site-relative document link into an absolute lovdata.no URL.

    Args:
        id: Path of the document, appended verbatim after the host name
            (e.g. ``/dokument/TRSTR/avgjorelse/ttel-2022-148789``).

    Returns:
        str: ``https://lovdata.no`` followed by ``id``.
    """
    case_url = f"https://lovdata.no{id}"
    return case_url

def get_unescaped_text(tag):
    """Return the text of ``tag`` with compatibility characters folded.

    NFKC is used rather than NFKD: both replace characters such as the
    non-breaking space (U+00A0) with their plain equivalent, but NFKD also
    splits letters like "å" into "a" + a combining ring, which corrupts
    Norwegian text for later string matching. NFKC keeps letters composed.

    Args:
        tag: Any object exposing the text to clean as a ``text`` attribute.

    Returns:
        str: The NFKC-normalized text.
    """
    return unicodedata.normalize("NFKC", tag.text)

def extract_div_data(div):
    """Summarize one section ``div`` as a title plus its paragraph texts.

    Args:
        div: Element to search; must offer ``find_all``.

    Returns:
        dict: ``"title"`` is the raw text of the first ``h1``-``h6`` found
        inside ``div`` (``"Missing title"`` when there is none); ``"content"``
        is the list of normalized texts of every ``<p>`` inside ``div`` whose
        text is not empty.
    """
    # any heading level counts: h1, h2, ..., h6
    headings = div.find_all(re.compile('^h[1-6]$'))
    title = headings[0].text if headings else "Missing title"

    content = [
        get_unescaped_text(paragraph)
        for paragraph in div.find_all('p')
        if len(paragraph.text) > 0
    ]

    return {"title": title, "content": content}
    

def extract_case_data(id):
    """Download one case page and split its body into introduction and sections.

    Args:
        id: Site-relative link of the case; ``make_case_url`` turns it into
            the URL that is fetched.

    Returns:
        dict | None: ``None`` when the page has no ``<div id="documentBody">``.
        Otherwise a dict with ``"id"`` (the argument as given),
        ``"introduction"`` (normalized text of every non-empty, non-``div``
        direct child of the body) and ``"paragraphs"`` (section key mapped to
        the result of ``extract_div_data``).
    """
    url = make_case_url(id)
    print(url)
    # A timeout keeps a stalled connection from hanging the whole scrape.
    res = requests.get(url, timeout=30)
    soup = bs(res.text, 'html.parser')
    doc_body = soup.find('div', id='documentBody')
    if not doc_body:
        return None

    introduction = []
    paragraphs = {}

    def add_section(div):
        # Key a section by data-id, then id. Sections sharing a key (typically
        # the 'no-id' fallback) get a numeric suffix instead of overwriting
        # the one stored earlier.
        base_key = div.get('data-id', div.get('id', 'no-id'))
        key = base_key
        suffix = 1
        while key in paragraphs:
            suffix += 1
            key = f"{base_key}-{suffix}"
        paragraphs[key] = extract_div_data(div)

    for child in doc_body.find_all(recursive=False):
        if child.name != 'div':
            # anything that is not a section div is introduction material
            if len(child.text) > 0:
                introduction.append(get_unescaped_text(child))
            continue

        # Look one layer further: a div may only be a wrapper around the
        # real section divs.
        subdivs = [s for s in child.find_all(recursive=False) if s.name == 'div']
        if subdivs:
            for s in subdivs:
                add_section(s)
        else:
            # Explicit if/else instead of the former for/else, whose else
            # branch always ran and stored the wrapper as well, duplicating
            # the paragraphs of its sub-divs.
            add_section(child)

    return {
        "id": id,
        "introduction": introduction,
        "paragraphs": paragraphs,
    }

In [72]:
# Display the first collected link to check what the hrefs look like.
documents[0]

'/dokument/TRSTR/avgjorelse/ttel-2022-148789'

In [74]:
import os
import json
from datetime import datetime

# Every run writes into its own folder, named after the day it was started.
date_str = datetime.now().strftime("%Y_%b_%d")
lovdata_gathering = f"../data/lovdata/scraped/{date_str}"
os.makedirs(lovdata_gathering, exist_ok=True)

# Collect the links for listing positions 0-299 (15 result pages of 20),
# then download every case and store it as one JSON file per case.
documents = get_documents(0, 300)
for case in documents:
    # the last path segment of the link doubles as the file name
    case_id = case.rsplit('/', 1)[-1]
    doc_path = os.path.join(lovdata_gathering, f"{case_id}.json")
    data = extract_case_data(case)
    if not data:
        # nothing was extracted for this link, so no file is written
        continue
    with open(doc_path, 'w', encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)


Iterating 15 times, starting at offset 0
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=0#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=20#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=40#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=60#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=80#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=100#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=120#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=140#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA&offset=160#doclistheader
found 20 documents
https://lovdata.no/register/avgj%C3%B8relser?verdict=TRA