# White House - web crawler

In [25]:
import requests
from os import makedirs
from os.path import join
from bs4 import BeautifulSoup
from glob import glob

---
### Building a web crawler that downloads the index pages of the Briefing Room

The code below crawls through the Briefing Room index pages of the White House (WH) website and saves the HTML content of each page as an `.html` file in a folder called 'index-pages'.

In [23]:
# Configuration constants
THE_URL = 'https://www.whitehouse.gov/briefing-room/'  # base URL of the Briefing Room index
INDEX_PAGES_DIR = "index-pages"  # directory in which the downloaded index pages are saved
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

# Create the output directory if it does not already exist. The constant is used
# (rather than a second string literal) so the directory name is defined in one place.
makedirs(INDEX_PAGES_DIR, exist_ok=True)

# Number of index pages to save:
max_page_num = 1  # adjust this according to the max amount of pages in the Briefing Room

# Download each index page and save it as '<page_num>.html'
for page_num in range(0, max_page_num):
    resp = requests.get(THE_URL, params={'page': page_num}, timeout=REQUEST_TIMEOUT)
    print("Downloading", resp.url)
    # Fail loudly on 4xx/5xx so an error page is never saved as if it were an index page.
    resp.raise_for_status()

    fname = join(INDEX_PAGES_DIR, '{}.html'.format(page_num))
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in the page and would raise UnicodeEncodeError.
    with open(fname, "w", encoding="utf-8") as wf:
        print("Saving as", fname)
        wf.write(resp.text)

Downloading https://www.whitehouse.gov/briefing-room/?page=0
Saving as index-pages/0.html


----
### Parsing the HTML files and extracting URLs using Beautiful Soup


In [28]:
# Collect the release links from every saved index page.
# The list is created once, before the loop, so links from all pages accumulate
# (assigning inside the loop would keep only the last page's links) and the
# name exists even when no saved pages are found.
actual_web_links = []

# sorted() makes the processing order deterministic; glob returns files in
# arbitrary order. Note the sort is lexicographic ('10.html' precedes '2.html').
for page in sorted(glob(join(INDEX_PAGES_DIR, '*.html'))):
    # Read raw bytes and let BeautifulSoup detect the document encoding,
    # instead of decoding with the platform default encoding.
    with open(page, 'rb') as file:
        soup = BeautifulSoup(file.read(), 'lxml')

    # after inspecting the HTML, I noticed all releases have the attr 'class:news-item__title',
    # which can be used to filter only relevant links.
    # href=True skips anchors without an href, which would otherwise raise KeyError below.
    releases = soup.find_all('a', class_='news-item__title', href=True)
    actual_web_links.extend(release['href'] for release in releases)

actual_web_links[:2] # to check that we have gotten the correct links

['https://www.whitehouse.gov/briefing-room/statements-releases/2023/12/17/statement-from-president-joe-biden-on-the-80th-anniversary-of-the-repeal-of-the-chinese-exclusion-act/',
 'https://www.whitehouse.gov/briefing-room/presidential-actions/2023/12/17/president-biden-announces-presidential-delegation-to-the-state-of-kuwait-to-pay-respects-upon-the-death-of-his-highness-sheikh-nawaf-al-ahmad-al-sabah-amir-of-the-state-of-kuwait/']