## Question 1-1

In [8]:
import requests
from bs4 import BeautifulSoup
import time

def find_press_releases(seed_url, keyword, limit=10, delay=1.0, timeout=10.0):
    """Breadth-first crawl from ``seed_url`` collecting press releases that mention ``keyword``.

    A page is treated as a press release when it contains an anchor with
    ``href='/en/press-release'`` and ``hreflang='en'``. It is kept when the
    text returned by ``soup.get_text()`` contains ``keyword``
    (case-insensitive). Only links whose ``href`` starts with ``/en/`` are
    followed; they are resolved against ``seed_url`` so the crawl stays on the
    seed's host.

    Args:
        seed_url: URL where the crawl starts.
        keyword: Text to look for in each press release.
        limit: Maximum number of matching URLs to collect.
        delay: Seconds to wait before every request (politeness pause).
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        list[str]: Matching URLs in the order they were discovered, at most
        ``limit`` entries.
    """
    # Function-scope imports keep this cell runnable on its own.
    from collections import deque
    from urllib.parse import urljoin

    needle = keyword.lower()
    press_releases = []
    to_visit = deque([seed_url])
    # Every URL ever queued. Deduplicating at enqueue time keeps the visit
    # order of the crawl unchanged while stopping the queue from filling up
    # with repeats of the same navigation links.
    seen = {seed_url}

    while to_visit and len(press_releases) < limit:
        url = to_visit.popleft()

        if delay > 0:
            time.sleep(delay)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping {url}: {exc}")
            continue
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        press_release_tag = soup.find('a', href='/en/press-release', hreflang='en')
        if press_release_tag and needle in soup.get_text().lower():
            press_releases.append(url)
            print(f"Found URL: {url}")

        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if not href.startswith('/en/'):
                continue
            absolute_url = urljoin(seed_url, href)
            if absolute_url not in seen:
                seen.add(absolute_url)
                to_visit.append(absolute_url)

    return press_releases

# Crawl settings for Question 1-1: start page, search term, and result cap.
seed_url, keyword, limit = 'https://press.un.org/en', 'crisis', 10

# Run the crawl and show the collected URLs.
press_releases = find_press_releases(seed_url=seed_url, keyword=keyword, limit=limit)
print("List of URLs:", press_releases)


Found URL: https://press.un.org/en/2023/sgsm21980.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21978.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21947.doc.htm
Found URL: https://press.un.org/en/2023/dsgsm1874.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21952.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21876.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21852.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21806.doc.htm
Found URL: https://press.un.org/en/2023/dsgsm1848.doc.htm
Found URL: https://press.un.org/en/2023/sgsm21765.doc.htm
List of URLs: ['https://press.un.org/en/2023/sgsm21980.doc.htm', 'https://press.un.org/en/2023/sgsm21978.doc.htm', 'https://press.un.org/en/2023/sgsm21947.doc.htm', 'https://press.un.org/en/2023/dsgsm1874.doc.htm', 'https://press.un.org/en/2023/sgsm21952.doc.htm', 'https://press.un.org/en/2023/sgsm21876.doc.htm', 'https://press.un.org/en/2023/sgsm21852.doc.htm', 'https://press.un.org/en/2023/sgsm21806.doc.htm', 'http

## Question 1-2

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import time

def find_press_releases(seed_url, keyword, limit=10, delay=1.0, timeout=10.0):
    """Walk numbered listing pages and collect plenary-session press releases mentioning ``keyword``.

    Listing pages are requested as ``f"{seed_url}/{page_number}"`` starting at
    page 0. On each page, every ``<a>`` whose ``href`` contains ``press-room``
    is resolved to an absolute URL. It is treated as an article when the URL
    contains ``/en/`` and matches ``/press-room/<8 digits>IPR<digits>``.
    An article is kept when it has a ``<span class="ep_name">`` whose string is
    exactly ``Plenary session`` and its raw HTML contains ``keyword``
    (case-insensitive).

    The walk stops when ``limit`` results have been collected, when a listing
    page cannot be fetched (network error or non-200 status), or when a listing
    page contributes no article URL that has not been checked before. The last
    two conditions keep the loop from running forever once the listing is
    exhausted.

    Args:
        seed_url: Listing URL prefix; the page number is appended after a ``/``.
        keyword: Text to look for in each article.
        limit: Maximum number of matching URLs to collect.
        delay: Seconds to pause after processing each listing page.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        list[str]: Matching article URLs in discovery order, at most ``limit``
        entries. Always a list, even when nothing matches.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import urljoin

    article_pattern = re.compile(r'/press-room/\d{8}IPR\d+')
    needle = keyword.lower()
    press_releases = []
    checked_articles = set()
    page_number = 0

    def fetch(url):
        """Return the response for ``url``, or None on a network error or non-200 status."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed for {url}: {exc}")
            return None
        return response if response.status_code == 200 else None

    while len(press_releases) < limit:
        current_url = f"{seed_url}/{page_number}"
        page_number += 1

        listing_response = fetch(current_url)
        if listing_response is None:
            break  # listing exhausted or unreachable
        soup = BeautifulSoup(listing_response.text, 'html.parser')

        # Collect article URLs on this page that have not been checked yet.
        # The same article is often linked more than once, so mark each URL
        # as checked as soon as it is queued.
        new_article_urls = []
        for link in soup.select('a[href*="press-room"]'):
            absolute_url = urljoin(current_url, link['href'])
            if '/en/' not in absolute_url:
                continue
            if not article_pattern.search(absolute_url):
                continue
            if absolute_url in checked_articles:
                continue
            checked_articles.add(absolute_url)
            new_article_urls.append(absolute_url)

        if not new_article_urls:
            break  # no progress on this page: treat as the end of the listing

        for absolute_url in new_article_urls:
            press_content_response = fetch(absolute_url)
            if press_content_response is None:
                continue
            press_content_soup = BeautifulSoup(press_content_response.text, 'html.parser')

            is_plenary_session = press_content_soup.find('span', class_='ep_name', string='Plenary session')
            # NOTE(review): the keyword is matched against the raw HTML, so
            # text inside markup or scripts also counts as a hit.
            press_content_text = press_content_response.text.lower()
            if is_plenary_session and needle in press_content_text:
                press_releases.append(absolute_url)
                print(f"Found URL: {absolute_url}")

                if len(press_releases) >= limit:
                    return press_releases

        if delay > 0:
            time.sleep(delay)

    return press_releases

# Crawl settings for Question 1-2: listing URL prefix, search term, and result cap.
seed_url, keyword, limit = (
    'https://www.europarl.europa.eu/news/en/press-room/page',
    'crisis',
    10,
)

# Run the crawl and show the collected URLs.
press_releases = find_press_releases(seed_url=seed_url, keyword=keyword, limit=limit)
print("List of URLs:", press_releases)


Found URL: https://www.europarl.europa.eu/news/en/press-room/20230929IPR06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230929IPR06130/parliament-argues-for-a-top-up-to-multi-annual-budget-for-crisis-response
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230911IPR04923/reduce-demand-and-protect-people-in-prostitution-say-meps
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230911IPR04918/svietlana-tsikhanouskaya-to-meps-support-belarusians-european-aspirations
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230911IPR04908/meps-vote-to-strengthen-eu-defence-industry-through-common-procurement
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230707IPR02427/covid-19-parliament-adopts-roadmap-to-better-prepare-for-future-health-crises
Found URL: https://www.europarl.europa.eu/news/en/press-room/20230707IPR02421/parliament-adopts-new-rules-to-