In [None]:
import gzip
import re
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
class NYTimesParser():
    """
    Class for parsing NYTimes articles using NYTimes sitemaps

    One sitemap URL is generated per month; every URL listed in a sitemap's
    <loc> elements is downloaded and reduced to one row of the result
    (date, headline, author, topic, text, link).
    """

    # Column order of the DataFrame returned by parse_articles().
    COLUMNS = ['date', 'headline', 'author', 'topic', 'text', 'link']

    # Placeholder stored for any field that cannot be found.
    MISSING = 'None'

    # Seconds to wait on a request before giving up. Without a timeout a
    # single stalled connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    # CSS classes of the <a> elements holding the author and the topic.
    # NOTE(review): these look like generated class names and may change
    # whenever the site is redeployed -- verify before a long crawl.
    AUTHOR_CLASS = 'css-mrorfa'
    TOPIC_CLASS = 'css-nuvmzp'

    # Matches the /YYYY/M/D/ path segment of an article link.
    _DATE_PATTERN = re.compile(r'/(\d{4})/(\d{1,2})/(\d{1,2})/')

    def __init__(self):
        """
        Initializes NYTimesParser obj
        """

    @staticmethod
    def _sitemap_urls(start_year, end_year):
        """
        Returns the monthly sitemap urls for start_year..end_year (inclusive)
        """
        return [
            f"https://www.nytimes.com/sitemaps/new/sitemap-{year}-{month:02d}.xml.gz"
            for year in range(start_year, end_year + 1)
            for month in range(1, 13)
        ]

    def _fetch_soup(self, url):
        """
        Downloads url and returns its body parsed into a BeautifulSoup tree
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        content = page.content

        # The sitemap urls end in .gz: if the body arrives still compressed
        # (it starts with the gzip magic bytes) inflate it before parsing.
        if content[:2] == b'\x1f\x8b':
            content = gzip.decompress(content)

        # Naming the parser keeps results independent of which optional
        # parsers happen to be installed and silences bs4's parser warning.
        return bs(content, "html.parser")

    def _fetch_soup_or_none(self, url):
        """
        Like _fetch_soup, but warns and returns None when the download fails
        """
        try:
            return self._fetch_soup(url)
        except (requests.RequestException, OSError) as exc:
            # One unreachable page must not discard everything collected so far.
            warnings.warn(f"Skipping {url}: {exc}")
            return None

    def _text_or_missing(self, tag):
        """
        Returns the text of tag, or the 'None' placeholder when tag is None
        """
        return self.MISSING if tag is None else tag.getText()

    def _extract_date(self, link):
        """
        Returns 'year-month-day' taken from the link path, or 'None' if absent
        """
        match = self._DATE_PATTERN.search(link)
        if match is None:
            return self.MISSING
        # int() drops leading zeros, e.g. /2018/01/05/ -> '2018-1-5'.
        year, month, day = (int(part) for part in match.groups())
        return f'{year}-{month}-{day}'

    def _parse_article(self, link, soup):
        """
        Builds one result row (a dict keyed by COLUMNS) from a parsed page
        """
        return {
            'date': self._extract_date(link),
            'headline': self._text_or_missing(soup.find("h1")),
            'author': self._text_or_missing(soup.find("a", {'class': self.AUTHOR_CLASS})),
            'topic': self._text_or_missing(soup.find("a", {'class': self.TOPIC_CLASS})),
            # main body of article: text of every <p> element, space separated
            'text': " ".join(p.getText() for p in soup.find_all("p")),
            'link': link,
        }

    def parse_articles(self, start_year, end_year):
        """
        Parses articles from start_year to end_year; returns pandas.Dataframe with extracted data

        Parameters
        ----------
        start_year : int
            First year whose monthly sitemaps are read.
        end_year : int
            Last year (inclusive) whose monthly sitemaps are read. If it is
            smaller than start_year no sitemap is read.

        Returns
        -------
        pandas.DataFrame
            One row per link found in the sitemaps, with the columns
            'date', 'headline', 'author', 'topic', 'text' and 'link'.
            A field that cannot be found holds the string 'None'; this
            includes 'date' when the link has no /YYYY/M/D/ segment.

        Notes
        -----
        Every link costs one HTTP request, so a full year can take a long
        time. Pages or sitemaps that cannot be downloaded are skipped with
        a warning instead of aborting the crawl.
        """
        # Rows are collected in a list and turned into a DataFrame once at
        # the end: DataFrame.append no longer exists in pandas 2.x and
        # growing a frame row by row is quadratic.
        records = []

        # ---------  parses through each monthly sitemap --------- #
        for sitemap in self._sitemap_urls(start_year, end_year):
            sitemap_soup = self._fetch_soup_or_none(sitemap)
            if sitemap_soup is None:
                continue
            links = [loc.getText() for loc in sitemap_soup.find_all("loc")]

            # -------- parses through all article links in sitemap -------- #
            for link in links:
                article_soup = self._fetch_soup_or_none(link)
                if article_soup is None:
                    continue
                records.append(self._parse_article(link, article_soup))

        # columns= fixes the column order and keeps the columns when empty
        return pd.DataFrame(records, columns=self.COLUMNS)

In [None]:
# --- Create the NYTimesParser instance that the parsing cell below calls --- #
parser = NYTimesParser()

In [None]:
# ---- Collect every article listed in the 2018 and 2019 sitemaps into df ---- #
# ---- (one HTTP request per article link, so this cell is slow)         ---- #
df = parser.parse_articles(start_year=2018, end_year=2019)