In [23]:
import http.client
import json
import math
import os
import time
import urllib.parse
import warnings

import pandas as pd
import requests
from tqdm import tqdm

api_token = "pub_52b33707255441248eeaddfbc868ec3a"
search_topic = 'crime'
source_domains = ['edition.cnn.com', 'foxnews.com']
limit = 25
# API Constraint
MAX_RETURN = 20000
max_page = MAX_RETURN // limit


locale = 'us'

In [24]:
def fetch_news_articles(api_key, search_topic, source_domain, limit, locale, max_articles_to_fetch=None):
    """
    Fetch news articles from NewsData.io with pagination, using the original two-stage logic.
    """
    # STEP 1: Initial API request to get metadata
    all_articles = []
    params = {
        'apikey': api_key,
        'category': search_topic,
        'domainurl': source_domain,
        'size': min(limit, 10),
        'language': 'en' if locale == 'us' else locale,
    }
    try:
        response = requests.get("https://newsdata.io/api/1/news", params=params)
        response.raise_for_status()
        parsed_data = response.json()
        all_articles.extend(parsed_data.get('results', []))
    except Exception as e:
        return pd.DataFrame()

    # STEP 2: Calculate pagination hyperparameters
    total_results = parsed_data.get('totalResults', 0)
    if max_articles_to_fetch:
        total_results = min(total_results, max_articles_to_fetch)
        
    pages_to_fetch = math.ceil(total_results / params['size'])
    next_page_token = parsed_data.get('nextPage')

    # STEP 3: Fetch all subsequent articles
    with tqdm(total=pages_to_fetch, initial=1, leave=True, position=0) as pbar:
        pbar.desc = f"Fetching {source_domain}"
        for _ in range(1, pages_to_fetch):
            if not next_page_token:
                break
                
            pbar.desc = f"Fetching page"
            params['page'] = next_page_token
            
            try:
                response = requests.get("https://newsdata.io/api/1/news", params=params)
                response.raise_for_status()
                parsed_data = response.json()
                
                articles = parsed_data.get('results', [])
                if not articles:
                    break
                
                all_articles.extend(articles)
                next_page_token = parsed_data.get('nextPage')
                time.sleep(0.5)
                
            except Exception as e:
                break
            
            pbar.update(1)
    
    # STEP 4: Create DataFrame
    if max_articles_to_fetch:
        df = pd.DataFrame(all_articles[:max_articles_to_fetch])
    else:
        df = pd.DataFrame(all_articles)
    return df

In [25]:
# Fetch articles for all configured sources
articles_by_source = {}
for source_domain in source_domains:
    source_name = source_domain.split('.')[0]
    articles_by_source[source_name] = fetch_news_articles(
        api_key=api_token, # Using api_token from setup cell
        search_topic=search_topic,
        source_domain=source_domain,
        limit=limit,
        locale=locale,
        max_articles_to_fetch=None # Early break for testing
    )

# Concatenate into single DataFrame with source column
all_articles_df = pd.concat([df.assign(source=name) for name, df in articles_by_source.items() if not df.empty], ignore_index=True)
all_articles_df.to_csv('data/news/all_articles_newsdataio.csv', index=False)

Fetching page: 100%|██████████| 2/2 [00:00<00:00,  1.40it/s]
Fetching page: 100%|██████████| 3/3 [00:01<00:00,  1.38it/s]
