In [40]:
import os
from requests_html import AsyncHTMLSession
import pandas as pd
import re

from helpers import *

In [41]:
# Single asynchronous session shared by every request made in this notebook.
asession = AsyncHTMLSession()
# Passed as `sleep=` to `arender` when law and abstract pages are rendered
# (presumably a delay in seconds — confirm against the requests_html docs).
SLEEP_TIME = 10
# Entry URLs handed to `get_laws_pagination_urls` for the English and the
# Ukrainian law listings respectively.
ENGLISH_LAWS_URL = "https://zakon.rada.gov.ua/laws/main/en/llenglaws"
UKRAINIAN_LAWS_URL = "https://zakon.rada.gov.ua/laws/main/a/page"

In [109]:
def save_document(element_list, documents_list_specifier, storage_path):
    """Write the non-blank elements of one document to numbered ``.txt`` files.

    A new folder for the document is created inside
    ``storage_path/documents_list_specifier``; its name comes from
    ``get_next_index`` (helper from ``helpers``; assumed to return the next
    free index as a string — confirm against its implementation).

    When ``documents_list_specifier`` contains ``"laws"`` the document is
    split into section sub-folders:

    * an element with a ``.rvts9`` descendant opens a new section folder and
      is written as that section's first file;
    * any other element is written into the most recently opened section;
    * elements that appear before the first section are skipped.

    Otherwise every element becomes one file directly in the document folder.

    Args:
        element_list: iterable of elements exposing ``.text`` and
            ``.find(selector)``.
        documents_list_specifier: name of the sub-folder of ``storage_path``
            that collects documents of this kind (e.g. ``"en_laws"``).
        storage_path: root folder of the corpus.

    Raises:
        OSError: if the document or section folder cannot be created or a
            file cannot be written.
    """
    documents_list_folder = os.path.join(storage_path, documents_list_specifier)
    create_folder_if_not_exists(documents_list_folder)
    document_folder = os.path.join(documents_list_folder, get_next_index(documents_list_folder))
    os.mkdir(document_folder)

    split_into_sections = "laws" in documents_list_specifier

    for element in element_list:
        text = element.text
        if not text.strip():
            continue

        if not split_into_sections:
            target_folder = document_folder
            para_index = get_next_index(document_folder, split_by_dot=True)
        elif element.find(".rvts9"):
            # Section title: open a new section folder and store the title in it.
            target_folder = os.path.join(document_folder, get_next_index(document_folder))
            os.mkdir(target_folder)
            para_index = get_next_index(target_folder)
        else:
            next_section_index = int(get_next_index(document_folder))
            if next_section_index == 0:
                # No section has been opened yet, so there is nowhere to put this text.
                continue
            target_folder = os.path.join(document_folder, str(next_section_index - 1))
            para_index = get_next_index(target_folder, split_by_dot=True)

        file_path = os.path.join(target_folder, f'{para_index}.txt')
        # Explicit UTF-8: the default encoding is platform dependent and cannot
        # represent Cyrillic text on e.g. cp1252 Windows installations.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)

In [83]:
async def retrieve_abstract(url):
    """Fetch ``url``, render it and return the elements that make up the abstract.

    The page is requested through the shared ``asession``, rendered with
    ``sleep=SLEEP_TIME``, and then queried for justified blocks and ``ul``
    lists located under ``.rvts0``.
    """
    abstract_page = await asession.get(url)
    rendered_html = abstract_page.html
    await rendered_html.arender(sleep=SLEEP_TIME)
    return rendered_html.find(".rvts0 [align='justify'], .rvts0 ul")

In [84]:
def get_abstract_link_element(page_response):
    """Return the first element of the page whose ``href`` contains ``anot``.

    The result is whatever ``html.find(..., first=True)`` yields, so callers
    must be prepared for a falsy value when the page has no such link.
    """
    abstract_link_selector = "[href*='anot']"
    page_html = page_response.html
    return page_html.find(abstract_link_selector, first=True)

In [85]:
# title and signature - ".rvts0 > .rvps8"
def get_law_element_list(page_response):
    """Return every element of the page matched by the law-body selector.

    The selector picks ``.rvps2`` direct children of ``.rvts0`` and any
    ``.rvts15`` descendants of ``.rvts0``.
    """
    law_body_selector = ".rvts0 > .rvps2, .rvts0 .rvts15"
    page_html = page_response.html
    return page_html.find(law_body_selector)

In [86]:
async def process_law_and_save(law_element_list, abstract_link_element, row, lang_prefix, storage_path):
    """Download the abstract of a law and store both law and abstract on disk.

    The abstract URL (first entry of ``abstract_link_element.absolute_links``)
    is recorded in ``row`` under ``"<lang_prefix>_abstract_url"``. The law
    elements are saved under ``"<lang_prefix>_laws"`` and the abstract
    elements under ``"<lang_prefix>_abstracts"`` inside ``storage_path``.
    """
    abstract_url_key = f'{lang_prefix}_abstract_url'
    abstract_url = next(iter(abstract_link_element.absolute_links))
    row[abstract_url_key] = abstract_url

    # The abstract is fetched first so nothing is written if the download fails.
    abstract_element_list = await retrieve_abstract(abstract_url)

    for elements, suffix in ((law_element_list, 'laws'), (abstract_element_list, 'abstracts')):
        save_document(elements, f'{lang_prefix}_{suffix}', storage_path)

In [103]:
async def retrieve_law_and_abstract_in_en_and_uk(en_law_url, storage_path, lang):
    """Scrape one law with its abstract, plus the Ukrainian counterpart, and log it in ``df.csv``.

    ``storage_path/df.csv`` is the progress log: if ``en_law_url`` is already
    present in its ``"<lang>_law_url"`` column the function returns without
    doing anything. Otherwise a new row is appended after processing:

    * no abstract link on the page: only the law URL is recorded;
    * abstract link present but no law elements found: ``is_standardized``
      is recorded as ``False``;
    * both present: law and abstract are saved with the ``en`` prefix, then
      the page linked through ``[lang='uk']`` is fetched and, when it offers
      an abstract link, saved with the ``uk`` prefix.

    Args:
        en_law_url: URL of the law page to process.
        storage_path: folder that holds ``df.csv`` and the saved documents.
        lang: prefix of the ``df.csv`` column used for the already-seen check
            and for recording ``en_law_url``.
    """
    df_path = os.path.join(storage_path, "df.csv")
    df = pd.read_csv(df_path)

    # Exact comparison on purpose: `str.contains` interprets the URL as a
    # regular expression and also matches substrings, so ".../show/123" was
    # treated as already processed once ".../show/1234" had been stored.
    if df[f'{lang}_law_url'].eq(en_law_url).any():
        return

    row = {f'{lang}_law_url': en_law_url}

    en_page_response = await asession.get(en_law_url)
    await en_page_response.html.arender(sleep=SLEEP_TIME)

    en_abstract_link_element = get_abstract_link_element(en_page_response)

    if en_abstract_link_element:
        en_law_element_list = get_law_element_list(en_page_response)

        if en_law_element_list:
            await process_law_and_save(en_law_element_list, en_abstract_link_element, row, 'en', storage_path)

            # The Ukrainian part is optional. Without these guards a missing
            # link raised AttributeError after the English files had been
            # written but before the row was stored, so a re-run saved the
            # English documents again. A row without `uk_*` values marks a
            # law whose Ukrainian counterpart was not saved.
            uk_law_link_element = en_page_response.html.find("[lang='uk']", first=True)
            uk_law_url = None
            if uk_law_link_element is not None:
                uk_law_url = next(iter(uk_law_link_element.absolute_links), None)

            if uk_law_url is not None:
                row['uk_law_url'] = uk_law_url
                uk_page_response = await asession.get(uk_law_url)
                await uk_page_response.html.arender(sleep=SLEEP_TIME)
                uk_abstract_link_element = get_abstract_link_element(uk_page_response)

                if uk_abstract_link_element is not None:
                    uk_law_element_list = get_law_element_list(uk_page_response)
                    await process_law_and_save(uk_law_element_list, uk_abstract_link_element, row, 'uk', storage_path)
        else:
            row['is_standardized'] = False

    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    df.to_csv(df_path, index=False)

In [88]:
async def retrieve_page_from_pagination(url, storage_folder, lang):
    """Process every law linked from one listing page.

    The listing at ``url`` is fetched and rendered; each ``.valid`` element
    is taken as a law link, and its first absolute link is passed on to
    ``retrieve_law_and_abstract_in_en_and_uk`` one law at a time.
    """
    listing_page = await asession.get(url)
    await listing_page.html.arender()

    for link_element in listing_page.html.find(".valid"):
        absolute_links = iter(link_element.absolute_links)
        await retrieve_law_and_abstract_in_en_and_uk(next(absolute_links), storage_folder, lang)

In [89]:
async def retrieve_laws_and_abstracts_from_pagination_pages(pagination_url_list, storage_folder, columns, lang):
    """Scrape every listing page in ``pagination_url_list`` into ``storage_folder``.

    Ensures that ``storage_folder`` exists and contains a ``df.csv`` progress
    log (created empty with ``columns`` when missing; an existing file is
    left untouched), then processes the listing pages sequentially.

    Args:
        pagination_url_list: URLs of the listing pages to walk through.
        storage_folder: destination folder for ``df.csv`` and the documents.
        columns: header used when a new ``df.csv`` has to be created.
        lang: language prefix forwarded to ``retrieve_page_from_pagination``.
    """
    # `to_csv` does not create missing parent folders, so make sure the
    # destination exists before the log is initialised.
    os.makedirs(storage_folder, exist_ok=True)

    df_path = os.path.join(storage_folder, "df.csv")

    if not os.path.exists(df_path):
        pd.DataFrame(columns=columns).to_csv(df_path, index=False)

    for pagination_url in pagination_url_list:
        await retrieve_page_from_pagination(pagination_url, storage_folder, lang)

In [90]:
async def get_laws_pagination_urls(entry_url, selector):
    """Return the URLs of the pages reachable from the pagination on ``entry_url``.

    Two strategies are used:

    1. If the page has an element matching ``selector`` with the title
       ``'в кінець'`` and its URL ends in ``page<N>``, the result is
       ``entry_url/page1`` … ``entry_url/page<N>``.
    2. Otherwise every element matching ``selector`` is listed: the element
       titled as the current page contributes ``entry_url`` itself, any other
       element contributes its first absolute link.

    Args:
        entry_url: URL of the first listing page.
        selector: CSS selector of the pagination elements.

    Returns:
        list[str]: the collected URLs, in page order.
    """
    r = await asession.get(entry_url)
    await r.html.arender()

    to_the_end_link_element = r.html.find(f"{selector}[title='в кінець']", first=True)

    if to_the_end_link_element:
        to_the_end_url = next(iter(to_the_end_link_element.absolute_links), "")
        match = re.search(r'page(\d+)$', to_the_end_url)

        # Only trust the shortcut when the last page number can actually be
        # parsed; otherwise fall through to listing the visible links instead
        # of failing on `None.group`.
        if match:
            max_page_number = int(match.group(1))
            return [f"{entry_url}/page{i}" for i in range(1, max_page_number + 1)]

    laws_pagination_urls = []

    for pagination_link_element in r.html.find(selector):
        title_attr = pagination_link_element.attrs.get('title')

        if title_attr in ('поточна сторінка', 'current page'):
            laws_pagination_urls.append(entry_url)
            continue

        # Elements without any link are skipped; a bare `next(iter(...))`
        # would raise StopIteration, which surfaces as RuntimeError inside a
        # coroutine.
        pagination_url = next(iter(pagination_link_element.absolute_links), None)
        if pagination_url is not None:
            laws_pagination_urls.append(pagination_url)

    return laws_pagination_urls

In [91]:
# Collect the listing-page URLs of the English laws; the bare name on the
# last line displays the resulting list as the cell output.
English_laws_pagination_urls = await get_laws_pagination_urls(ENGLISH_LAWS_URL, ".page-link")
English_laws_pagination_urls

['https://zakon.rada.gov.ua/laws/main/en/llenglaws',
 'https://zakon.rada.gov.ua/laws/main/en/llenglaws/page2',
 'https://zakon.rada.gov.ua/laws/main/en/llenglaws/page3',
 'https://zakon.rada.gov.ua/laws/main/en/llenglaws/page4']

In [None]:
# Collect one URL per non-active `.btn-link` button inside `.nav-list` on the
# Ukrainian entry page. The scraping cell further down treats each URL as the
# listing of one year (it reads the year from the last four characters).
Ukranian_laws_pagination_urls_by_year = await get_laws_pagination_urls(UKRAINIAN_LAWS_URL, ".nav-list .btn-link:not(.active)")
Ukranian_laws_pagination_urls_by_year

In [None]:
try:
    laws_with_abstracts_folder = os.path.join(os.path.curdir, "laws_with_abstracts")
    all_ukrainian_legal_documents_folder = os.path.join(laws_with_abstracts_folder, "all_ukrainian_legal_documents")
    create_folder_if_not_exists(all_ukrainian_legal_documents_folder)

    for year_url in Ukranian_laws_pagination_urls_by_year:
        year = year_url[-4:]
        year_folder = os.path.join(all_ukrainian_legal_documents_folder, year)
        create_folder_if_not_exists(year_folder)
        year_pagination_urls = await get_laws_pagination_urls(year_url, ".page-link")
        lang = "uk"
        columns = [f'{lang}_law_url', f'{lang}_abstract_url', 'is_standardized']
        await retrieve_laws_and_abstracts_from_pagination_pages(year_pagination_urls, year_folder, columns, lang)
except:
    print("An exception occurred")

In [110]:
storage_folder = os.path.join(os.path.curdir, "laws_with_abstracts", "split_by_sections")
columns = [f'en_law_url', f'en_abstract_url', 'is_standardized']
await retrieve_laws_and_abstracts_from_pagination_pages(English_laws_pagination_urls, storage_folder, columns, "en")

  df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
