In [1]:
def fetch_UN_press_release(keyword, maxNumUrl):
    # Function: Returns United Nation Press Release with a specific keyword and amount
    # @param keyword: The word that is necessary in the press release, e.g., crisis
    # @param maxNumUrl: The number of press releases that are expected
    # Return: No return. Prints the header and content of the press releases
    from bs4 import BeautifulSoup
    import urllib.request
    #from urllib.request import Request

    seed_url = "https://press.un.org/en"

    urls = [seed_url]    # Queue of urls to crawl
    seen = [seed_url]    # Stack of urls seen so far. we keep track of seen urls so that we don't revisit them
    press_releases = []  # Link of press releases
    press_releases_header = []
    press_releases_content = []


    print("Starting with url = " + seed_url)
    while len(urls) > 0 and len(press_releases) < maxNumUrl:
        # DEQUEUE A URL FROM urls AND TRY TO OPEN AND READ IT
        try:
            curr_url = urls.pop(0)
            print("num. of URLs in stack: %d " % len(urls))
            print("Accessing = " + curr_url)
            req = urllib.request.Request(curr_url,headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urllib.request.urlopen(req).read()
            seen.append(curr_url)

        except Exception as ex:
            print("Unable to access = " + curr_url)
            print(ex)
            continue    #skip code below

        # IF URL OPENS, CHECK WHICH URLS THE PAGE CONTAINS
        # ADD THE URLS FOUND TO THE QUEUE url AND seen
        soup = BeautifulSoup(webpage)  #creates object soup
        # Put child URLs into the stack

        whether_press_release = False
        for tag in soup.find_all('a', href = True): #find tags with links
            childUrl = tag['href'] #extract just the link
            if childUrl == "/en/press-release":
                whether_press_release = True
                print("***This is a press release***")
            o_childurl = childUrl
            childUrl = urllib.parse.urljoin(seed_url, childUrl)
            if seed_url in childUrl and childUrl not in seen:
                urls.append(childUrl)
                seen.append(childUrl)
        if whether_press_release:
            header = soup.find('h1', {'class' : 'page-header'}).text
            content = ""
            for paragraph in soup.find('div', {'class' : 'field field--name-body field--type-text-with-summary field--label-hidden field__item'}).find_all('p'):
                content += paragraph.text + "\n\n"
            if keyword in content.lower():
                press_releases.append(curr_url)
                press_releases_header.append(header)
                press_releases_content.append(content)
                with open("1_" + str(len(press_releases)) + ".txt", "w") as f:
                    f.write(str(soup))
                print("***This is a press release, and contain the word 'crisis'***")
                print("Saved to: ./1_" + str(len(press_releases)) + ".txt")

    print("Num of press releases fetched:", len(press_releases))
    for i in range(len(press_releases)):
        print("------------ Press Release " + str(i + 1) + ":", press_releases[i], "------------")
        print("Header:", press_releases_header[i])
        print("Content:", press_releases_content[i])

In [2]:
# Takes ~1min to finish
fetch_UN_press_release("crisis", 10)

Starting with url = https://press.un.org/en
num. of URLs in stack: 0 
Accessing = https://press.un.org/en
num. of URLs in stack: 45 
Accessing = https://press.un.org/en#main-content
num. of URLs in stack: 44 
Accessing = https://press.un.org/en/content/secretary-general
num. of URLs in stack: 61 
Accessing = https://press.un.org/en/content/secretary-general/press-release
num. of URLs in stack: 61 
Accessing = https://press.un.org/en/content/secretary-general/press-conference
num. of URLs in stack: 71 
Accessing = https://press.un.org/en/content/general-assembly
num. of URLs in stack: 90 
Accessing = https://press.un.org/en/content/general-assembly/meetings-coverage
num. of URLs in stack: 94 
Accessing = https://press.un.org/en/content/general-assembly/press-release
num. of URLs in stack: 100 
Accessing = https://press.un.org/en/content/security-council
num. of URLs in stack: 103 
Accessing = https://press.un.org/en/content/security-council/meetings-coverage
num. of URLs in stack: 107 




num. of URLs in stack: 307 
Accessing = https://press.un.org/en/2023/sgsm21964.doc.htm
***This is a press release***
num. of URLs in stack: 310 
Accessing = https://press.un.org/en/2023/sgsm21963.doc.htm
***This is a press release***
num. of URLs in stack: 313 
Accessing = https://press.un.org/en/2023/sgsm21962.doc.htm
***This is a press release***
num. of URLs in stack: 317 
Accessing = https://press.un.org/en?page=0
num. of URLs in stack: 316 
Accessing = https://press.un.org/en?page=1
num. of URLs in stack: 315 
Accessing = https://press.un.org/en?page=2
num. of URLs in stack: 314 
Accessing = https://press.un.org/en?page=3
num. of URLs in stack: 313 
Accessing = https://press.un.org/en?page=4
num. of URLs in stack: 312 
Accessing = https://press.un.org/en?page=5
num. of URLs in stack: 311 
Accessing = https://press.un.org/en?page=6
num. of URLs in stack: 310 
Accessing = https://press.un.org/en?page=7
num. of URLs in stack: 309 
Accessing = https://press.un.org/en?page=8
num. of UR

num. of URLs in stack: 616 
Accessing = https://press.un.org/en/2023/ecosoc7144.doc.htm
num. of URLs in stack: 620 
Accessing = https://press.un.org/en/2023/ecosoc7143.doc.htm
num. of URLs in stack: 623 
Accessing = https://press.un.org/en/2023/dsgsm1860.doc.htm
***This is a press release***
num. of URLs in stack: 626 
Accessing = https://press.un.org/en/2023/ecosoc7142.doc.htm
num. of URLs in stack: 631 
Accessing = https://press.un.org/en/2023/ecosoc7141.doc.htm
num. of URLs in stack: 634 
Accessing = https://press.un.org/en/2023/sgsm21882.doc.htm
***This is a press release***
num. of URLs in stack: 637 
Accessing = https://press.un.org/en/2023/ecosoc7140.doc.htm
num. of URLs in stack: 640 
Accessing = https://press.un.org/en/2023/ecosoc7139.doc.htm
num. of URLs in stack: 643 
Accessing = https://press.un.org/en/2023/sgsm21876.doc.htm
***This is a press release***
***This is a press release, and contain the word 'crisis'***
Saved to: ./1_6.txt
num. of URLs in stack: 646 
Accessing = 

In [3]:
def fetch_European_Parliament_press_release(keyword, maxNumUrl):
    # Function: Returns European Parliament Press Release with a specific keyword and amount
    # @param keyword: The word that is necessary in the press release, e.g., crisis
    # @param maxNumUrl: The number of press releases that are expected
    # Return: No return. Prints the header and content of the press releases
    from bs4 import BeautifulSoup
    import urllib.request
    #from urllib.request import Request

    seed_url = "https://www.europarl.europa.eu/news/en/press-room"

    urls = [seed_url]    # Queue of urls to crawl
    seen = [seed_url]    # Stack of urls seen so far. we keep track of seen urls so that we don't revisit them
    press_releases = []  # Link of press releases
    press_releases_header = []
    press_releases_content = []


    print("Starting with url = " + seed_url)
    while len(urls) > 0 and len(press_releases) < maxNumUrl:
        # DEQUEUE A URL FROM urls AND TRY TO OPEN AND READ IT
        try:
            curr_url = urls.pop(0)
            print("num. of URLs in stack: %d " % len(urls))
            print("Accessing = " + curr_url)
            req = urllib.request.Request(curr_url,headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urllib.request.urlopen(req).read()
            seen.append(curr_url)

        except Exception as ex:
            print("Unable to access = " + curr_url)
            print(ex)
            continue    #skip code below

        # IF URL OPENS, CHECK WHICH URLS THE PAGE CONTAINS
        # ADD THE URLS FOUND TO THE QUEUE url AND seen
        soup = BeautifulSoup(webpage)  #creates object soup
        # Put child URLs into the stack

        for tag in soup.find_all('a', href = True): #find tags with links
            childUrl = tag['href'] #extract just the link
            o_childurl = childUrl
            childUrl = urllib.parse.urljoin(seed_url, childUrl)
            if seed_url in childUrl and childUrl not in seen:
                urls.append(childUrl)
                seen.append(childUrl)
        
        if curr_url == seed_url:
            continue
        whether_press_release = False
        for tag in soup.find_all('span', {'class' : 'ep_name'}):
            ep_name = tag.text
            # Add one more condition as the non-press-releases would also have <span class="ep_name">Plenary session</span>
            if ep_name == "Plenary session" and soup.find('article', {'id': 'website-body'}):
                whether_press_release = True
                print("***This is a press release***")
                break
        if whether_press_release:
            header = soup.find('h1', {'class' : 'ep_title'}).find('div', {'class' : 'ep-p_text'}).find('span', {'class' : 'ep_name'}).text
            content = ""
            for paragraph in soup.find_all('p', {'class' : 'ep-wysiwig_paragraph'}):
                content += paragraph.text + "\n\n"
            if keyword in content.lower():
                press_releases.append(curr_url)
                press_releases_header.append(header)
                press_releases_content.append(content)
                with open("2_" + str(len(press_releases)) + ".txt", "w") as f:
                    f.write(str(soup))
                print("***This is a press release, and contain the word 'crisis'***")
                print("Saved to: ./2_" + str(len(press_releases)) + ".txt")

    print("Num of press releases fetched:", len(press_releases))
    for i in range(len(press_releases)):
        print("------------ Press Release " + str(i + 1) + ":", press_releases[i], "------------")
        print("Header:", press_releases_header[i])
        print("Content:", press_releases_content[i])

In [4]:
# Takes ~3mins to finish
fetch_European_Parliament_press_release("crisis", 10)

Starting with url = https://www.europarl.europa.eu/news/en/press-room
num. of URLs in stack: 0 
Accessing = https://www.europarl.europa.eu/news/en/press-room
num. of URLs in stack: 28 
Accessing = https://www.europarl.europa.eu/news/en/press-room#website-body
num. of URLs in stack: 27 
Accessing = https://www.europarl.europa.eu/news/en/press-room#language-select
num. of URLs in stack: 26 
Accessing = https://www.europarl.europa.eu/news/en/press-room#search-field
num. of URLs in stack: 25 
Accessing = https://www.europarl.europa.eu/news/en/press-room/accreditation
num. of URLs in stack: 28 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit
num. of URLs in stack: 39 
Accessing = https://www.europarl.europa.eu/news/en/press-room/contacts
num. of URLs in stack: 49 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20230929IPR06141/hearing-of-commissioner-designate-wopke-hoekstra
num. of URLs in stack: 54 
Accessing = https://www.europarl.europa.eu/new

num. of URLs in stack: 106 
Accessing = https://api.whatsapp.com/send?text=https://www.europarl.europa.eu/news/en/press-room/press-tool-kit?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Bwhatsapp%5D-%5Ben%5D-%5Bnews%5D-%5Bpress-kit-2021%5D-%5Bhome_press-kit-2021%5D%26
num. of URLs in stack: 105 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/0/a-brief-guide-to-the-european-parliament
num. of URLs in stack: 112 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/1/democracy-in-action-parliament-s-priorities-and-achievements
num. of URLs in stack: 259 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/2/how-to-follow-parliament-s-work
num. of URLs in stack: 262 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/3/surveys-in-all-member-states
num. of URLs in stack: 265 
Accessing = https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/4/european-elections-results-1979-2019


num. of URLs in stack: 327 
Accessing = https://twitter.com/intent/tweet?text=Media%20Freedom%20Act:%20press%20conference%20on%20Parliament%E2%80%99s%20demands&url=https://www.europarl.europa.eu/news/en/press-room/20231002IPR06202/media-freedom-act-press-conference-on-parliament-s-demands?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Btwitter%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bmedia-freedom-media-advisory%5D-&via=Europarl_EN
Unable to access = https://twitter.com/intent/tweet?text=Media%20Freedom%20Act:%20press%20conference%20on%20Parliament%E2%80%99s%20demands&url=https://www.europarl.europa.eu/news/en/press-room/20231002IPR06202/media-freedom-act-press-conference-on-parliament-s-demands?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Btwitter%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bmedia-freedom-media-advisory%5D-&via=Europarl_EN
HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
num. of URLs in stack: 3

Unable to access = https://twitter.com/intent/tweet?text=Media%20Advisory:%20Hearing%20of%20Commissioner-designate%20Wopke%20Hoekstra%20on%202%20October%20%20&url=https://www.europarl.europa.eu/news/en/press-room/20230925IPR05920/media-advisory-hearing-of-commissioner-designate-wopke-hoekstra-on-2-october?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Btwitter%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bhearing-of-commissioner-designate-hoekstra%5D-&via=Europarl_EN
HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
num. of URLs in stack: 310 
Accessing = https://www.linkedin.com/shareArticle?mini=true&url=https://www.europarl.europa.eu/news/en/press-room/20230925IPR05920/media-advisory-hearing-of-commissioner-designate-wopke-hoekstra-on-2-october?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Blinkedin%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bhearing-of-commissioner-designate-hoekstra%5D%26&title=European%20Parliame

Unable to access = https://twitter.com/intent/tweet?text=Spanish%20Presidency%20debriefs%20EP%20committees%20on%20priorities&url=https://www.europarl.europa.eu/news/en/press-room/20230904IPR04608/spanish-presidency-debriefs-ep-committees-on-priorities?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Btwitter%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bes-presidency-in-committees%5D-&via=Europarl_EN
HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
num. of URLs in stack: 500 
Accessing = https://www.linkedin.com/shareArticle?mini=true&url=https://www.europarl.europa.eu/news/en/press-room/20230904IPR04608/spanish-presidency-debriefs-ep-committees-on-priorities?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Blinkedin%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Bes-presidency-in-committees%5D%26&title=European%20Parliament&summary=&source=
num. of URLs in stack: 499 
Accessing = https://api.whatsapp.com/send?text=https://www

num. of URLs in stack: 548 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20221209IPR64426/eu-long-term-budget-needs-urgent-revision-to-cope-with-current-crises
***This is a press release***
***This is a press release, and contain the word 'crisis'***
Saved to: ./2_1.txt
num. of URLs in stack: 551 
Accessing = https://www.facebook.com/share.php?u=https://www.europarl.europa.eu/news/en/press-room/20230918IPR05434/three-countries-to-receive-nearly-EU455-million-in-eu-aid-after-natural-disasters?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Bfacebook%5D-%5Ben%5D-%5Bnews%5D-%5Bpressroom%5D-%5Beusf-support-for-romania-italy-turkiye%5D-
num. of URLs in stack: 550 
Accessing = https://twitter.com/intent/tweet?text=Three%20countries%20to%20receive%20nearly%20%E2%82%AC455%20million%20in%20EU%20aid%20after%20natural%20disasters&url=https://www.europarl.europa.eu/news/en/press-room/20230918IPR05434/three-countries-to-receive-nearly-EU455-million-in-eu-aid-after-natural-disasters?xtor%

Unable to access = https://twitter.com/intent/tweet?text=Democracy%20in%20Action:%20Parliament's%20priorities%20and%20achievements&url=https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/1/democracy-in-action-parliament-s-priorities-and-achievements?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Btwitter%5D-%5Ben%5D-%5Bnews%5D-%5Bpress-kit-2021%5D-%5Bpage_press-kit-2021%5D-&via=Europarl_EN
HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
num. of URLs in stack: 555 
Accessing = https://www.linkedin.com/shareArticle?mini=true&url=https://www.europarl.europa.eu/news/en/press-room/press-tool-kit/1/democracy-in-action-parliament-s-priorities-and-achievements?xtor%3DAD-78-%5BSocial_share_buttons%5D-%5Blinkedin%5D-%5Ben%5D-%5Bnews%5D-%5Bpress-kit-2021%5D-%5Bpage_press-kit-2021%5D%26&title=European%20Parliament&summary=&source=
num. of URLs in stack: 554 
Accessing = https://api.whatsapp.com/send?text=h

***This is a press release***
num. of URLs in stack: 668 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20230310IPR77228/meps-back-plans-for-a-climate-neutral-building-sector-by-2050
***This is a press release***
num. of URLs in stack: 671 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20221212IPR64509/deal-reached-on-new-carbon-leakage-instrument-to-raise-global-climate-ambition
num. of URLs in stack: 675 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20221205IPR60607/deal-on-new-law-to-ensure-products-causing-deforestation-are-not-sold-in-the-eu
num. of URLs in stack: 679 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20230414IPR80129/parliament-adopts-new-law-to-fight-global-deforestation
***This is a press release***
num. of URLs in stack: 682 
Accessing = https://www.europarl.europa.eu/news/en/press-room/20221107IPR49611/sustainable-economy-parliament-adopts-new-reporting-rules-for-multinationals
***This is a press re

In [5]:
# Github Repo: https://github.com/yd2658/b9122_homework2
# Use this command to start: git clone https://github.com/yd2658/b9122_homework2.git