## Question 1 a

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time

# Crawl configuration and result accumulators for Question 1 a
seed_url = "https://press.un.org"   # site root; relative hrefs are appended to it
base_url = f"{seed_url}/en"         # page whose anchors are scanned first
crisis_press_releases = list()      # full URLs of pages whose text mentions "crisis"
press_release_links = list()        # raw hrefs collected from base_url

# Function to extract links from a page
def extract_links(url):
    """Download ``url`` and return the ``href`` of every anchor tag that has one.

    Args:
        url: Address of the page to fetch.

    Returns:
        list[str]: The href values exactly as written in the page; relative
        paths are not resolved.

    Raises:
        Any exception raised by ``urlopen`` propagates to the caller.
    """
    # Read the body inside the context manager so the connection is released
    # even when the download fails part-way through.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]

# Function to check if a page contains the word "crisis"
def contains_crisis(url):
    """Report whether the visible text of the page at ``url`` mentions "crisis".

    The match is a case-insensitive substring test, so longer strings that
    contain "crisis" also count.

    Args:
        url: Address of the page to fetch.

    Returns:
        bool: True when "crisis" occurs in the lower-cased page text.

    Raises:
        Any exception raised by ``urlopen`` propagates to the caller.
    """
    # Read the body inside the context manager so the connection is always closed.
    with urlopen(url) as response:
        html = response.read()
    page_text = BeautifulSoup(html, "html.parser").get_text()
    return "crisis" in page_text.lower()

# Start with the base URL
press_release_links.extend(extract_links(base_url))

# Iterate through the links. An href may occur more than once in the collected
# list, so track what has been handled: otherwise the same release would be
# fetched again and could be counted (and later saved) twice.
checked_links = set()
for link in press_release_links:
    if link in checked_links:
        continue
    checked_links.add(link)

    if link.startswith("/en/") and link.endswith(".doc.htm"):
        full_url = seed_url + link
        try:
            if contains_crisis(full_url):
                crisis_press_releases.append(full_url)
                # Stop when you have exactly 10 press releases
                if len(crisis_press_releases) == 10:
                    break
        except Exception as e:
            # Best effort: report the failure and move on to the next link.
            print(f"Error: {e}")
        # Introduce a delay between requests to avoid overloading the server
        time.sleep(1)  # You can adjust the delay as needed

# Save the HTML source code of each press release
for index, press_release in enumerate(crisis_press_releases, start=1):
    # The context manager releases the connection once the body has been read.
    with urlopen(press_release) as response:
        html_content = response.read().decode("utf-8")

    # Files are named 1_<n>.txt, numbered from 1 in the order the matches were found
    file_name = f"1_{index}.txt"

    # Save the HTML source code to the file
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(html_content)
    print(f"Saved {file_name}")

# Print the list of press releases containing "crisis"
for index, press_release in enumerate(crisis_press_releases, start=1):
    print(f"Press Release {index}: {press_release}")



## Question 1 b

##### Note: this cell takes about 1.5 minutes to run

In [2]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Listing-page URL pattern; the page number is substituted in via str.format
base_url_template = "https://www.europarl.europa.eu/news/en/press-room/page/{}"
crisis_press_releases = list()   # links whose page text mentions "crisis"
press_release_links = list()

# Function to extract links from a page
def extract_links(url):
    """Download ``url`` and return the ``href`` of every anchor tag that has one.

    The request is sent with a ``User-Agent: Mozilla/5.0`` header. This is a
    best-effort helper: any failure is printed and reported as "no links".

    Args:
        url: Address of the page to fetch.

    Returns:
        list[str]: The href values exactly as written in the page, or an empty
        list when the page could not be fetched or parsed.
    """
    try:
        request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Read the body inside the context manager so the connection is
        # released even when the download fails part-way through.
        with urlopen(request) as response:
            html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        return [anchor["href"] for anchor in soup.find_all("a", href=True)]
    except Exception as e:
        print(f"Error in extract_links: {e}")
        return []

# Function to check if a page contains the word "crisis"
def contains_crisis(url):
    """Report whether the visible text of the page at ``url`` mentions "crisis".

    The request is sent with a ``User-Agent: Mozilla/5.0`` header and the match
    is a case-insensitive substring test. This is a best-effort helper: any
    failure is printed and treated as "no match".

    Args:
        url: Address of the page to fetch.

    Returns:
        bool: True when "crisis" occurs in the lower-cased page text; False
        otherwise, including when the page could not be fetched or parsed.
    """
    try:
        request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Read the body inside the context manager so the connection is always closed.
        with urlopen(request) as response:
            html = response.read()
        page_text = BeautifulSoup(html, "html.parser").get_text()
        return "crisis" in page_text.lower()
    except Exception as e:
        print(f"Error in contains_crisis: {e}")
        return False

# Iterate through pages by incrementing page_number
MAX_PAGES = 5  # upper bound on the number of listing pages scanned
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}  # same header the helper functions send

# NOTE(review): unlike the Question 1 a cell, no cap on the number of matches is
# applied here — confirm whether one is intended.
checked_links = set()
page_number = 0
while page_number < MAX_PAGES:
    current_url = base_url_template.format(page_number)
    page_links = extract_links(current_url)

    if not page_links:
        break  # No more pages to scrape

    for link in page_links:
        # The same href can show up repeatedly (within a page or across pages);
        # handle each one once so nothing is downloaded or saved twice.
        if link in checked_links:
            continue
        checked_links.add(link)

        # Only absolute http(s) URLs are requested. Other hrefs (relative paths,
        # mailto:, javascript:, ...) cannot be fetched as written and would only
        # produce an error message from contains_crisis.
        if not link.lower().startswith(("http://", "https://")):
            continue

        if contains_crisis(link):
            crisis_press_releases.append(link)

    # Increment the page number for the next page
    page_number += 1

# Save the HTML source code of each press release
for index, press_release in enumerate(crisis_press_releases, start=1):
    # Send the same User-Agent as the scan above and release the connection
    # once the body has been read.
    request = Request(press_release, headers=REQUEST_HEADERS)
    with urlopen(request) as response:
        html_content = response.read().decode("utf-8")

    # Files are named 2_<n>.txt, numbered from 1 in the order the matches were found
    file_name = f"2_{index}.txt"

    # Save the HTML source code to the file
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(html_content)
    print(f"Saved {file_name}")

# Print the list of press releases containing "crisis"
for idx, press_release in enumerate(crisis_press_releases, start=1):
    print(f"Press Release {idx}: {press_release}")


Saved 2_1.txt
Saved 2_2.txt
Saved 2_3.txt
Saved 2_4.txt
Saved 2_5.txt
Saved 2_6.txt
Saved 2_7.txt
Saved 2_8.txt
Saved 2_9.txt
Saved 2_10.txt
Press Release 1: https://www.europarl.europa.eu/news/en/press-room/20231006IPR06504/where-there-is-a-political-will-there-is-a-way-migration-enlargement-budget
Press Release 2: https://www.europarl.europa.eu/news/en/press-room/20230929IPR06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan
Press Release 3: https://www.europarl.europa.eu/news/en/press-room/20230929IPR06130/parliament-argues-for-a-top-up-to-multi-annual-budget-for-crisis-response
Press Release 4: https://www.europarl.europa.eu/news/en/press-room/20230904IPR04608/spanish-presidency-debriefs-ep-committees-on-priorities
Press Release 5: https://www.europarl.europa.eu/news/en/press-room/20230918IPR05429/meps-argue-for-a-top-up-to-multi-annual-budget-for-crisis-response
Press Release 6: https://www.europarl.europa.eu/news/en/press-room/20230911IPR04923/reduce-demand-