In [7]:
import os
import re

import csv

import requests

from bs4 import BeautifulSoup

In [8]:
def get_text(soup):
    """Collect the body text of a parsed page.

    Walks every ``<p>`` element found in ``soup`` and keeps only those that
    carry no ``class`` attribute and have no ``<b>`` children. The text of each
    kept paragraph is followed by a blank line (two newlines).

    Returns:
        A single string with all kept paragraphs concatenated, or ``""`` when
        no paragraph qualifies.
    """
    chunks = []
    for paragraph in soup.find_all('p'):
        # Paragraphs with a class attribute are skipped outright.
        if paragraph.has_attr('class'):
            continue
        # So are paragraphs that contain bold markup.
        if paragraph.findChildren('b'):
            continue
        chunks.append(paragraph.get_text())
        chunks.append("\n\n")

    return "".join(chunks)

In [9]:
def get_npr_urls(soup):
    """Return the NPR article links present in a parsed page.

    A link qualifies when its ``href`` starts with ``https://www.npr.org/``
    and contains a ``20YY/M/D/<digits>/`` path segment (date followed by a
    numeric ID). Links are returned in document order; duplicates are kept.
    """
    article_path = re.compile(r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/')
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    return [
        href
        for href in hrefs
        if href.startswith('https://www.npr.org/') and article_path.search(href)
    ]

In [10]:
def get_title(soup):
    """Extract the article title from a parsed page.

    Looks at every ``<a>`` element whose first class is ``"title"`` and reads
    the text of its first ``<b>`` child. When several anchors qualify, the
    last one in document order wins.

    Anchors with an empty class list, or with no ``<b>`` child, are ignored
    rather than raising ``IndexError``.

    Returns:
        The title text, or ``""`` when no usable title anchor is found.
    """
    title = ""
    for link in soup.find_all('a'):
        if not link.has_attr('class'):
            continue

        classes = link['class']
        # Guard against an empty class list before indexing into it.
        if not classes or classes[0] != "title":
            continue

        bold_children = link.findChildren('b')
        # A title anchor without bold markup has nothing to read; keep any
        # title found so far instead of crashing.
        if bold_children:
            title = bold_children[0].get_text()

    return title

In [11]:
def get_details(url):
    """Pull the publication date and article ID out of an article URL.

    The URL must contain a ``20YY/M/D/<digits>/`` segment. The first such
    segment is split on ``/``; its first three parts are joined with dashes to
    form the date (no zero-padding is applied) and the fourth is the ID.

    Returns:
        A ``(date, article_id)`` tuple of strings, e.g.
        ``("2019-03-05", "700000000")``.

    Raises:
        TypeError: if ``url`` has no such segment (the search result is
            ``None`` and cannot be indexed).
    """
    found = re.search(r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/', url)
    # The matched text ends with "/", so the split has a trailing empty item.
    year, month, day, article_id = found[0].split("/")[:4]

    return "-".join((year, month, day)), article_id

In [12]:
##############
## Crawl npr.org starting from the home page, following article links found
## on each visited page, and collect one dict per article page.

start_url = "https://www.npr.org/"
traversed_urls = set()

to_traverse = set()
to_traverse.add(start_url)

article_dict_list = []

# Date/ID URL shape required by get_details(); pages whose URL lacks it (such
# as the start page) are crawled for links but never recorded as articles.
ARTICLE_URL_PATTERN = r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/'

# Seconds to wait on a request before giving up on that page.
REQUEST_TIMEOUT_SECONDS = 10

# NOTE: despite its name, article_count counts every page visited (including
# pages that are skipped or fail to download), so 200 caps the number of
# requests, not the number of stored articles.
article_count = 0
while len(to_traverse) > 0 and article_count < 200:
    url = to_traverse.pop()
    traversed_urls.add(url)
    article_count += 1

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        # One unreachable page should not abort the whole crawl.
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Queue every article link on this page that has not been visited yet.
    links = get_npr_urls(soup)
    urls = [x for x in links if x not in traversed_urls]
    to_traverse.update(urls)

    text = get_text(soup)

    # Only pages with a meaningful amount of text AND an article-shaped URL
    # are recorded; get_details() cannot parse any other URL.
    if len(text) > 100 and re.search(ARTICLE_URL_PATTERN, url):
        title = get_title(soup)
        date, article_id = get_details(url)

        article_dict = {"Text": text, "Link": url, "Title": title, "Date": date, "ID": article_id}
        article_dict_list.append(article_dict)

In [13]:
## Persist the crawl results: one row per article in article_log.csv, plus the
## article text in its own "<article id>.txt" file in the same folder.
folder_out = 'data/articles/'
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(folder_out, exist_ok=True)

# Use the keys of the first article when there is one; fall back to the known
# column set so an empty crawl still produces a valid, header-only CSV.
default_fieldnames = ["Text", "Link", "Title", "Date", "ID"]
fieldnames = list(article_dict_list[0].keys()) if article_dict_list else default_fieldnames

# newline='' is required by the csv module so embedded newlines in the article
# text are quoted correctly and no blank rows appear on Windows. UTF-8 keeps
# non-ASCII article text writable regardless of the platform's locale.
with open(os.path.join(folder_out, "article_log.csv"), 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for article_dict in article_dict_list:
        writer.writerow(article_dict)
        article_path = os.path.join(folder_out, article_dict["ID"] + ".txt")
        with open(article_path, 'w', encoding='utf-8') as article_hdl:
            article_hdl.write(article_dict["Text"])