### Imports

In [1]:
# Web scraping (NYTimes article pages)
from bs4 import BeautifulSoup
import requests

# Utility
from pathlib import Path
from os import path
import json
import time
import random

### Functions

In [2]:
def get_filepaths(directory_path: str):
    """Collect the raw .json files found anywhere below a directory.

    Args:
        directory_path: Directory that is searched recursively.

    Returns:
        A list of path strings, one for every ``*.json`` match whose file
        name does not start with 'fulltext', in the order ``rglob`` yields them.
    """
    collected = []
    for candidate in Path(directory_path).rglob("*.json"):
        # Names starting with 'fulltext' are left out of the result
        if candidate.name.startswith("fulltext"):
            continue
        collected.append(str(candidate))

    return collected

In [3]:
def filter_scraped_files(directory_path: str):
    """Return the raw .json files that do not have a 'fulltext_' export yet.

    A raw file counts as already scraped when a file named
    ``fulltext_<raw file name>`` exists in the same directory.

    Args:
        directory_path: Directory that is searched recursively.

    Returns:
        A list of path strings for the raw files still to be scraped, in the
        order ``rglob`` yields them.
    """
    prefix = "fulltext_"
    root = Path(directory_path)

    # Get all .json files without 'fulltext' prefix
    json_filepaths = [str(file) for file in root.rglob("*.json") if not file.name.startswith("fulltext")]

    # Map every export back to the raw file it was created from. Only the file
    # NAME is stripped: replacing 'fulltext_' across the whole path string would
    # also rewrite parent directories that happen to contain that text, and the
    # raw file would then never be recognised as done.
    scraped_sources = {
        str(file.with_name(file.name[len(prefix):]))
        for file in root.rglob(f"{prefix}*.json")
    }

    # Filter out files that have a corresponding fulltext version
    return [file for file in json_filepaths if file not in scraped_sources]

In [4]:
def to_json(filepath: str, export_file: list[dict], verbose: bool):
    """Write records as indented JSON next to ``filepath``.

    The output lands in the same directory as ``filepath`` and carries the
    same file name prefixed with 'fulltext_'.

    Args:
        filepath: Path of the source file the records belong to.
        export_file: Records to serialise.
        verbose: When truthy, print the path that was written.
    """
    # Same directory, file name prefixed with 'fulltext_'
    target = path.join(path.dirname(filepath), "fulltext_" + path.basename(filepath))

    with open(target, 'w') as handle:
        json.dump(export_file, handle, indent=4)

    if not verbose:
        return

    print(f"Processed '{target}'")

In [5]:
def fall_back_scrape_full_text(url: str, timeout: float = 30.0):
    """Fetch ``url`` with a plain GET and return the text of its <article> element.

    No custom headers or cookies are sent with this request.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout requests waits indefinitely for a server that
            never answers, which would stall the whole scraping run.

    Returns:
        The stripped text of every <p> inside the first <article> element,
        joined with newlines, or the string "Article content not found" when
        the page has no <article> element.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection errors, and timeouts once ``timeout`` is exceeded).
    """
    # Load HTML content
    response_fallback = requests.get(url, timeout=timeout)

    # Create a BeautifulSoup object
    soup_fallback = BeautifulSoup(response_fallback.text, 'html.parser')

    # Locate the article element
    article_container = soup_fallback.find('article')
    if article_container is None:
        return "Article content not found"

    # Extract paragraphs
    return "\n".join(p.get_text(strip=True) for p in article_container.find_all('p'))

In [9]:
def scrape_nyt_url(json_filepaths: list[str], timeout: float = 30.0):
    """Download title and full text for every article listed in the given files.

    Each input file is loaded as JSON and the article URLs are read from
    ``['response']['docs'][i]['web_url']``. Every URL is requested with the
    cookies and User-Agent defined below, and the resulting records
    (``url``, ``title``, ``fulltext``) are exported once per input file
    through ``to_json``.

    Args:
        json_filepaths: Paths of the .json files to process.
        timeout: Seconds handed to each page request as its ``timeout`` so a
            server that never answers cannot block the run indefinitely.

    Raises:
        requests.exceptions.RequestException: Propagated from the page
            request (connection errors, timeouts). Records collected for the
            file being processed are not exported in that case.
    """
    # Define cookies (Inspect Element -> Application -> Cookies)
    # NOTE(review): this looks like a personal session cookie committed to the
    # notebook -- consider loading it from the environment instead of source.
    cookies = {
        'NYT-S': '0^CBsSMgjA89K9BhDlsoC-BhoSMS3vHQ4I8sRMXKS37rBEiuLmIJyp_IIBKgIeVTjogcO9BkIAGkCRjMn5pSdItU9C4SNSrJhz0JM7gT4dYyYN4c848033teMQvSH1n48eUuNTG1aHE8cyB-Ep_-BgQA3y5gJ3hCEC',
        'nyt-a': 'jp4fDxi0FZZqELYzNbf4yQ'
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }

    for file in json_filepaths:
        # Open .json
        with open(file, "r") as f:
            search_result = json.load(f)  # Load JSON data into a dictionary

        # Get all web urls
        web_url_list = [article['web_url'] for article in search_result['response']['docs']]

        # List to store article data for export
        list_of_dict = []

        # Iterate through web urls
        for url in web_url_list:
            # A fresh session per URL (as before) so cookies set by one response
            # are not carried over to the next article; the context manager
            # closes the session's connections instead of leaking them.
            with requests.Session() as session:
                page = session.get(url, headers=headers, cookies=cookies, timeout=timeout)

            # Parse the content
            soup = BeautifulSoup(page.text, 'html.parser')

            # Extract title (either <h1> or <title>)
            title = soup.find('h1') or soup.find('title')

            # Extract full article text (either from <section> or <article>).
            # The `or` has to sit outside the first find() call: inside the
            # argument list it always evaluated to the attribute dict, so the
            # <article> alternative was never tried on this page.
            article_container = soup.find('section', {'name': 'articleBody'}) or soup.find('article')
            if article_container:
                full_text = "\n".join(p.get_text(strip=True) for p in article_container.find_all('p'))  # Extract paragraphs
            else:
                # Fall-back
                full_text = fall_back_scrape_full_text(url)

            # Prepare result as a dictionary
            list_of_dict.append({
                'url': url,
                'title': title.get_text(strip=True) if title else 'Title not found',
                'fulltext': full_text
            })

            # Introduce a random delay between 1 and 5 seconds
            time.sleep(random.uniform(1, 5))

        to_json(filepath=file, export_file=list_of_dict, verbose=True)

### Execution

In [10]:
# Alternative (disabled): select every raw .json file, including ones that
# already have a 'fulltext_' export.
# json_filepaths = get_filepaths(directory_path = "../../data/raw/")
# print(len(json_filepaths))
# json_filepaths[0:3]

# Resume mode: keep only the raw files without a corresponding 'fulltext_' export,
# then show how many are left and preview the first three paths.
json_filepaths = filter_scraped_files(directory_path = "../../data/raw/")
print(len(json_filepaths))
json_filepaths[0:3]

465


['../../data/raw/2022/organizations_Amazon.com_Inc_mth10_pg0.json',
 '../../data/raw/2022/organizations_Amazon.com_Inc_mth11_pg0.json',
 '../../data/raw/2022/organizations_Microsoft_Corp_mth01_pg0.json']

In [11]:
# Scrape every selected file; a "Processed '...'" line is printed after each export.
# Slow by design: the scraper sleeps 1-5 seconds after every article request.
scrape_nyt_url(json_filepaths)

Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth10_pg0.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth11_pg0.json'
Processed '../../data/raw/2022/fulltext_organizations_Microsoft_Corp_mth01_pg0.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth07_pg1.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth06_pg1.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth07_pg0.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth06_pg0.json'
Processed '../../data/raw/2022/fulltext_organizations_Microsoft_Corp_mth01_pg1.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth03_pg2.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth10_pg1.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_Inc_mth11_pg1.json'
Processed '../../data/raw/2022/fulltext_organizations_Amazon.com_

KeyboardInterrupt: 