In [1]:
### Set-up
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from simplified_scrapy.simplified_doc import SimplifiedDoc

### Q1 - 1

In [2]:
### Find the links on the landing page that point to press-release listings

# Landing page of the UN press site
base_url = 'https://press.un.org/en'

# Defined up front so the name exists even when the request fails
press_release_links = []

# Fetch the landing page; the timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(base_url, timeout=30)

if response.status_code == 200:

    # Parse the webpage
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep every href (relative or absolute) that contains "press-release"
    press_release_links = [
        a['href'] for a in soup.find_all('a', href=True)
        if "press-release" in a['href']
    ]

    # Output the links
    for link in press_release_links:
        print(link)
else:
    # Report the failure instead of silently printing nothing
    print(f"Failed to retrieve content from {base_url} (HTTP {response.status_code})")

/en/content/secretary-general/press-release
/en/content/general-assembly/press-release
/en/content/security-council/press-release
/en/content/economic-and-social-council/press-release
https://www.icj-cij.org/en/press-releases
https://press.un.org/en/content/press-release
/en/content/press-release


#### From the results above, we can see that https://press.un.org/en/content/press-release is the page that lists all the press releases, so we do the web scraping from https://press.un.org/en/content/press-release. 
#### Also, when checking that website, we notice that the URL for each page follows a predictable pattern: a page number is appended at the end (like https://press.un.org/en/content/press-release?page=1). We will use this pattern to move from page to page.

In [3]:
### fetch child website from above link

def get_press_releases_with_crisis(max_page_number=10, max_results=10, timeout=30):
    """Collect URLs of UN press releases whose page text mentions "crisis".

    Listing pages ``?page=1`` .. ``?page=max_page_number`` are fetched in
    order. Every link found on a listing page is downloaded, and a page is
    kept when it contains an ``<a href="/en/press-release" hreflang="en">``
    anchor and the word "crisis" appears (case-insensitively) in its text.

    Args:
        max_page_number: Last listing page number to visit (inclusive).
        max_results: Stop as soon as this many matching URLs are collected.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of at most ``max_results`` absolute URLs, in discovery order.
    """
    url_pattern = 'https://press.un.org/en/content/press-release?page={}'
    press_releases_with_crisis = []

    if max_results <= 0:
        return press_releases_with_crisis

    # Navigation links repeat on every listing page; remembering what was
    # already fetched avoids re-downloading them and prevents duplicates in
    # the result.
    visited_urls = set()

    # NOTE(review): numbering starts at page=1; if the site's pager is
    # zero-based the first listing page is never visited - confirm.
    for page_number in range(1, max_page_number + 1):
        page_url = url_pattern.format(page_number)

        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException:
            # A single unreachable listing page should not abort the crawl
            print(f"Failed to retrieve content from {page_url}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve content from {page_url}")
            continue

        ### fetch child links from each page
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            press_release_url = urljoin('https://press.un.org', link['href'])

            if press_release_url in visited_urls:
                continue
            visited_urls.add(press_release_url)

            try:
                press_release_response = requests.get(press_release_url, timeout=timeout)
            except requests.RequestException:
                # Unfetchable links (bad scheme, dead host, ...) are skipped
                continue

            if press_release_response.status_code != 200:
                continue

            ### Check the anchor
            press_release_soup = BeautifulSoup(press_release_response.text, 'html.parser')
            specific_anchor = press_release_soup.find('a', {'href': '/en/press-release', 'hreflang': 'en'})

            ### Check the word 'crisis'
            if specific_anchor and "crisis" in press_release_soup.get_text().lower():
                press_releases_with_crisis.append(press_release_url)

                ### Stop when we have enough
                if len(press_releases_with_crisis) >= max_results:
                    return press_releases_with_crisis

    return press_releases_with_crisis


### function to save the website as txt
def save_press_releases(press_releases, prefix="1", timeout=30):
    """Download each URL and write its response text to ``<prefix>_<n>.txt``.

    Files are numbered from 1 in the order of ``press_releases`` and written
    to the current working directory with UTF-8 encoding. A URL that cannot
    be downloaded (network error or HTTP error status) is reported and
    skipped, so no error page is stored under a press-release file name;
    its number is left unused.

    Args:
        press_releases: Iterable of URLs to download.
        prefix: Leading part of each file name (default "1").
        timeout: Seconds to wait for each HTTP request.
    """
    for num, url in enumerate(press_releases, start=1):
        filename = f"{prefix}_{num}.txt"

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to save {url}: {exc}")
            continue

        with open(filename, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"Saved: {url} as {filename}")

In [4]:
# Usage: scan up to 10 listing pages, then write every match to disk
press_releases_with_crisis = get_press_releases_with_crisis(max_page_number=10)
save_press_releases(press_releases=press_releases_with_crisis)



Saved: https://press.un.org/en/2023/sgsm21967.doc.htm as 1_1.txt
Saved: https://press.un.org/en/2023/dsgsm1877.doc.htm as 1_2.txt
Saved: https://press.un.org/en/2023/sgsm21959.doc.htm as 1_3.txt
Saved: https://press.un.org/en/2023/sgsm21956.doc.htm as 1_4.txt
Saved: https://press.un.org/en/2023/sgsm21952.doc.htm as 1_5.txt
Saved: https://press.un.org/en/2023/sgsm21951.doc.htm as 1_6.txt
Saved: https://press.un.org/en/2023/sgsm21950.doc.htm as 1_7.txt
Saved: https://press.un.org/en/2023/sgsm21947.doc.htm as 1_8.txt
Saved: https://press.un.org/en/2023/sgsm21945.doc.htm as 1_9.txt
Saved: https://press.un.org/en/2023/dsgsm1874.doc.htm as 1_10.txt


### Q1 - 2

In [5]:
### fetch all links from the page
def crawl_page(base_url, page_number, timeout=30):
    """Return the link URLs found on one listing page.

    The page address is ``base_url`` with ``page_number`` appended. The body
    is decoded as UTF-8 (undecodable bytes are replaced rather than raising)
    and parsed with SimplifiedDoc; the ``'url'`` entry of every item returned
    by ``listA(url=page_url)`` is collected.

    Args:
        base_url: URL prefix that the page number is appended to.
        page_number: Page number to fetch.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list of link URLs from the page.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
    """
    page_url = f"{base_url}{page_number}"
    response = requests.get(page_url, timeout=timeout)
    # Do not harvest links from an error page
    response.raise_for_status()
    doc = SimplifiedDoc(response.content.decode('utf-8', errors='replace'))
    return [a['url'] for a in doc.listA(url=page_url)]

### check the marker (the "Plenary session" span)
def is_press_release(url, timeout=30):
    """Tell whether the page at ``url`` carries the "Plenary session" marker.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        True when the response text contains
        ``<span class="ep_name">Plenary session</span>``; False otherwise,
        including when the page cannot be fetched at all.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Unfetchable links (non-HTTP scheme, dead host, ...) are simply
        # not press releases; they must not abort the whole crawl.
        return False
    return '<span class="ep_name">Plenary session</span>' in response.text

### check the word 
def contains_word(url, word, timeout=30):
    """Tell whether ``word`` occurs (case-insensitively) in the page at ``url``.

    The comparison is a plain substring search over the raw response text,
    so markup and attribute values are searched as well as visible text.

    Args:
        url: Address of the page to inspect.
        word: Text to look for.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        True when the lower-cased word is found in the lower-cased response
        text; False otherwise, including when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A page that cannot be downloaded cannot match
        return False
    return word.lower() in response.text.lower()

### go through all links 
def find_press_releases_with_word(base_url, word, max_results=10, max_pages=None):
    """Walk listing pages until enough press releases containing ``word`` are found.

    Pages are requested through ``crawl_page`` starting at page 0. Each link
    not seen before is kept when ``is_press_release`` and ``contains_word``
    both accept it. The walk ends when ``max_results`` matches are collected,
    when ``max_pages`` pages have been visited, or when a page yields no link
    that was not already seen (taken to mean the listing is exhausted).

    Args:
        base_url: URL prefix passed to ``crawl_page``.
        word: Word that a press release must contain.
        max_results: Maximum number of URLs to return.
        max_pages: Optional cap on the number of listing pages to visit;
            ``None`` means no cap.

    Returns:
        A list of at most ``max_results`` matching URLs, in discovery order.
    """
    press_releases_with_word = []
    seen_urls = set()
    page_number = 0

    while len(press_releases_with_word) < max_results:
        if max_pages is not None and page_number >= max_pages:
            break

        # Navigation links repeat on every page; only look at links that
        # have not been examined yet (order of first appearance is kept).
        new_urls = []
        for url in crawl_page(base_url, page_number):
            if url not in seen_urls:
                seen_urls.add(url)
                new_urls.append(url)

        # Nothing new on this page: stop instead of looping forever
        if not new_urls:
            break

        for url in new_urls:
            if len(press_releases_with_word) >= max_results:
                break
            # Checked lazily so no page is downloaded once enough are found
            if is_press_release(url) and contains_word(url, word):
                press_releases_with_word.append(url)
                print(f"Found '{word}' in press release: {url}")

        page_number += 1

    return press_releases_with_word[:max_results]

In [6]:
# Usage
base_url = 'https://www.europarl.europa.eu/news/en/press-room/page/'
word = 'crisis'
press_releases = find_press_releases_with_word(base_url, word, 10)

# Save each matching press release as 2_<n>.txt, numbered in discovery order
for num, url in enumerate(press_releases, start=1):

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Skip the download rather than storing an error page
        print(f"Failed to save {url}: {exc}")
        continue

    filename = f"2_{num}.txt"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(response.text)

Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230929ipr06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan
Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230929ipr06130/parliament-argues-for-a-top-up-to-multi-annual-budget-for-crisis-response
Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230911ipr04923/reduce-demand-and-protect-people-in-prostitution-say-meps
Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230911ipr04918/svietlana-tsikhanouskaya-to-meps-support-belarusians-european-aspirations
Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230911ipr04908/meps-vote-to-strengthen-eu-defence-industry-through-common-procurement
Found 'crisis' in press release: https://www.europarl.europa.eu/news/en/press-room/20230707ipr02427/covid-19-parliament-adopts-roadmap-to-better-prepare-fo