In [None]:
!pip3 install selenium webdriver-manager

# Extract data and save to JSON files

Save each country (first-level page) to its own JSON file, including all of its topics (second-level pages) and the text of each topic (third-level pages).


In [None]:
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
from time import sleep
import random
from tqdm.notebook import tqdm
import json
import os


# Start a headless Firefox session; webdriver-manager provides the geckodriver binary
opts = FirefoxOptions()
opts.add_argument("--headless")
gecko_service = Service(GeckoDriverManager().install())
browser = webdriver.Firefox(service=gecko_service, options=opts)
browser.set_page_load_timeout(60)

# Open the landing page and wait (up to 10) until its title contains "UAhelpinfo"
browser.get("https://uahelpinfo.notion.site/uahelpinfo/UAhelpinfo-70c556bf892748299fe747d95c1b8aa0")
landing_page_wait = WebDriverWait(browser, 10)
landing_page_wait.until(EC.title_contains("UAhelpinfo"))
print(browser.title)

# Extract links
def get_links(browser):
    """Return the link elements found on the page currently loaded in ``browser``.

    The function first pauses for a random interval between 2 and 5
    (rounded to one decimal), then waits up to 10 for every ``<a>`` element
    nested inside a ``<div class="pseudoSelection">`` to be present.

    Args:
        browser: The Selenium WebDriver whose current page is inspected.

    Returns:
        The list of matching link elements produced by the wait.
    """
    pause = round(random.uniform(2, 5), 1)
    sleep(pause)

    link_locator = (By.XPATH, '//div[@class="pseudoSelection"]//a')
    waiter = WebDriverWait(browser, 10)
    return waiter.until(EC.presence_of_all_elements_located(link_locator))

def get_text(browser):
    """Return the text fragments of the page currently loaded in ``browser``.

    After a random pause between 2 and 5 (rounded to one decimal), the
    function waits up to 10 for the ``<div>`` elements that either carry
    ``placeholder="Heading 2"`` or ``data-content-editable-leaf="true"``.

    Args:
        browser: The Selenium WebDriver whose current page is inspected.

    Returns:
        A list of strings, one per matched element, in the order the elements
        were returned. Text of "Heading 2" elements is prefixed with "• ".
    """
    pause = round(random.uniform(2, 5), 1)
    sleep(pause)

    content_locator = (
        By.XPATH,
        '//div[@placeholder="Heading 2" or @data-content-editable-leaf="true"]',
    )
    elements = WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located(content_locator)
    )

    # Headings get a bullet marker; every other fragment is taken verbatim
    return [
        ("• " + element.text)
        if element.get_attribute('placeholder') == "Heading 2"
        else element.text
        for element in elements
    ]


# Create the output directory if it does not exist
# (exist_ok avoids the separate existence check and its race window)
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# To continue a previous scraping run, collect the countries that already have
# a JSON file. Only "*.json" entries are considered, so unrelated files in the
# directory are not truncated into bogus country names.
existing = [
    file_name[:-5]  # drop the ".json" suffix
    for file_name in os.listdir(output_dir)
    if file_name.endswith(".json")
]

# Get country links from the landing page
country_links = get_links(browser)
# Extract (country name, href) pairs from the link elements
links_hrefs = [(country_link.text, country_link.get_attribute('href')) for country_link in country_links]


# From each country page, extract the topic links, then the text of every topic,
# and save one JSON file per country.
href_bar = tqdm(links_hrefs, total=len(links_hrefs))
try:
    for country, link_href in href_bar:
        # Countries saved by a previous run are skipped so scraping can resume
        if country in existing:
            continue
        href_bar.set_description(country)

        country_output = []

        # Open the country link
        browser.get(link_href)
        # Get topic links
        topic_links = get_links(browser)
        # Extract (topic name, href) pairs before navigating to the topic pages
        topic_links_hrefs = [(topic_link.text, topic_link.get_attribute('href')) for topic_link in topic_links]

        # From each topic page, extract text
        for topic, topic_link_href in topic_links_hrefs:
            # Open the topic link
            browser.get(topic_link_href)
            # Extract the text
            fragments = (fragment.strip() for fragment in get_text(browser))
            # Remove unrequired text: empty fragments and the ones ending with
            # "На початок"; the last remaining fragment is dropped as well
            # (presumably page boilerplate — TODO confirm)
            kept = [fragment for fragment in fragments
                    if fragment and not fragment.endswith("На початок")][:-1]

            country_output.append({
                "topic": topic,
                # Join the list of text portions
                "text": "\n".join(kept),
                "url": topic_link_href,
            })

        # Save the output to a file. Serialise first so that no file is created
        # if serialisation fails, and write UTF-8 explicitly: with
        # ensure_ascii=False the non-ASCII text is written as-is, which breaks
        # on platforms whose default encoding is not UTF-8.
        output = json.dumps(country_output, indent=4, ensure_ascii=False)
        output_path = os.path.join(output_dir, country + ".json")
        with open(output_path, "w", encoding="utf-8") as json_output:
            json_output.write(output)
finally:
    # Always shut the headless Firefox down, even when scraping fails midway;
    # otherwise every run leaves a browser process behind.
    browser.quit()


In [1]:
# Collect all Education pages into a single JSON file

import json
import os

output_dir = "output"
education_file_name = "Education.json"

# Per-country JSON files. The combined file written at the end of this cell
# lives in the same directory, so it is excluded here; otherwise re-running the
# cell would read its own output and duplicate every item. Sorting makes the
# order of the result independent of the directory listing order.
json_files = sorted(
    file_name
    for file_name in os.listdir(output_dir)
    if file_name.endswith(".json") and file_name != education_file_name
)

education_items = []

for json_file in json_files:
    country = json_file[:-5]  # file name without the ".json" suffix
    # Read as UTF-8 explicitly instead of relying on the platform default
    with open(os.path.join(output_dir, json_file), encoding="utf-8") as json_input:
        json_content = json.load(json_input)
    # Keep the topics whose title starts with "Освіта" and tag each with its country
    for item in json_content:
        if item["topic"].startswith("Освіта"):
            item["Country"] = country
            education_items.append(item)

# ensure_ascii=False writes non-ASCII characters as-is, so the file is written as UTF-8
output = json.dumps(education_items, indent=4, ensure_ascii=False)
with open(os.path.join(output_dir, education_file_name), "w", encoding="utf-8") as json_edu_output:
    json_edu_output.write(output)