In [None]:
import http.client
import json
import math
import os
import time
import urllib.parse

import pandas as pd
import requests
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
# The API token is read from the environment so the secret never lives in the
# notebook itself. Set THENEWSAPI_TOKEN before starting the kernel.
api_token = os.environ.get("THENEWSAPI_TOKEN", "")
if not api_token:
    print("WARNING: THENEWSAPI_TOKEN is not set; requests will be sent without a token.")

search_topic = 'crime'
source_domains = ['cnn.com', 'foxnews.com']
limit = 25
locale = 'us'
# API Constraint
MAX_RETURN = 20000
# Highest page number that stays within MAX_RETURN at the chosen page size.
max_page = MAX_RETURN // limit

# urlencode() without doseq would encode the *repr* of a Python list
# ("['cnn.com', 'foxnews.com']"), so the domains are joined into a single
# comma-separated value instead. The search term and domains reuse the
# variables above rather than repeating the literals.
params = urllib.parse.urlencode({
    'api_token': api_token,
    'search': search_topic,
    'domains': ','.join(source_domains),
    'limit': limit
})




In [None]:
def _request_news_page(api_token, search_topic, source_domain, limit, locale, page):
    """Request one result page from TheNewsAPI and return the decoded JSON dict."""
    query = urllib.parse.urlencode({
        'api_token': api_token,
        'search': search_topic,
        'domains': source_domain,
        'limit': limit,
        'page': page,
        'locale': locale
    })
    # A timeout keeps a single stalled connection from hanging the notebook.
    response = requests.get(f"https://api.thenewsapi.com/v1/news/all?{query}", timeout=30)
    # Surface HTTP errors here rather than failing later on a payload that
    # lacks the expected 'meta' / 'data' keys.
    response.raise_for_status()
    return json.loads(response.content.decode('utf-8'))


def fetch_news_articles(api_token, search_topic, source_domain, limit, locale):
    """
    Fetch news articles from TheNewsAPI with pagination.

    The first page is requested once to learn how many articles match; the
    remaining pages (2..N) are then fetched one by one. Each page is requested
    exactly once, so no article is duplicated by the pagination itself.

    Parameters:
    -----------
    api_token : str
        API authentication token
    search_topic : str
        Search keyword/topic
    source_domain : str
        News source domain (e.g., 'foxnews.com')
    limit : int
        Number of articles per page (must be positive)
    locale : str
        Value sent as the ``locale`` query parameter (e.g., 'us')

    Returns:
    --------
    df : pd.DataFrame
        DataFrame containing all fetched articles, sorted by published_at.
        An empty DataFrame is returned when the first request fails or when
        no articles are found.

    Raises:
    -------
    ValueError
        If ``limit`` is not a positive number.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    # API constraints
    MAX_RETURN = 20000
    max_page = MAX_RETURN // limit

    # STEP 1: First page -- provides both the metadata and the first articles.
    try:
        first_page = _request_news_page(
            api_token, search_topic, source_domain, limit, locale, page=1
        )
        found = first_page['meta']['found']
        all_articles = list(first_page['data'])
        # Show only the metadata; dumping every article floods the cell output.
        print(json.dumps(first_page['meta'], indent=4))
    except Exception as e:
        # Best effort: report the problem and hand back an empty frame.
        print(f"Error fetching first page: {e}")
        return pd.DataFrame()

    # STEP 2: Work out how many pages exist, capped by the API constraint.
    found_page = math.ceil(found / limit)
    pages_to_fetch = min(found_page, max_page)

    # STEP 3: Fetch the remaining pages (page 1 is already in all_articles).
    with tqdm(total=pages_to_fetch, leave=True, position=0) as pbar:
        if pages_to_fetch:
            pbar.update(1)  # account for the first page fetched above

        for page in range(2, pages_to_fetch + 1):
            pbar.set_description(f"Fetching page {page}")
            try:
                page_data = _request_news_page(
                    api_token, search_topic, source_domain, limit, locale, page=page
                )
                all_articles.extend(page_data['data'])
            except Exception as e:
                # Skip the failed page but keep going with the rest.
                print(f"Error on page {page}: {e}")
            finally:
                # Advance even on failure so the bar always reaches its total.
                pbar.update(1)

            # Pause between requests to stay gentle on the API.
            time.sleep(0.5)

    # STEP 4: Create DataFrame
    if not all_articles:
        # Nothing to sort; sort_values would raise on the missing column.
        return pd.DataFrame()

    df = pd.DataFrame(all_articles)
    if 'published_at' in df.columns:
        df = df.sort_values(by='published_at')
    return df.reset_index(drop=True)

In [None]:
# Build one DataFrame per configured domain, keyed by the text that precedes
# the first dot of the domain (e.g. 'cnn.com' -> 'cnn').
articles_by_source = {}
for source_domain in source_domains:
    source_name = source_domain.partition('.')[0]
    articles_by_source[source_name] = fetch_news_articles(
        api_token, search_topic, source_domain, limit, locale
    )

{
    "meta": {
        "found": 3505,
        "returned": 25,
        "limit": 25,
        "page": 1
    },
    "data": [
        {
            "uuid": "8ff40aee-6754-4363-8d68-2ba77ea03dcf",
            "title": "DC crime law: Why Joe Biden is playing defense on crime",
            "description": "The Senate this week passed a Republican-led resolution to overturn a Washington, DC, crime law, which critics have argued is soft on violent criminals.",
            "keywords": "2020 presidential election, brand safety-nsf crime, brand safety-nsf sensitive, continents and regions, crime, law enforcement and corrections, criminal offenses, domestic alerts, domestic-2020 elections, domestic-us news, domestic-us politics, elections (by type), elections and campaigns, government and public administration, iab-crime, iab-elections, iab-politics, international alerts, international-us 2020 elections, international-us news, international-us politics, joe biden, new york (state), new york city, n

Fetching page 141: 100%|██████████| 141/141 [02:31<00:00,  1.08s/it]


{
    "meta": {
        "found": 10317,
        "returned": 25,
        "limit": 25,
        "page": 1
    },
    "data": [
        {
            "uuid": "d432495b-5cf8-478d-945d-dbed42f52ab1",
            "title": "Trump claims 'we're against crime. Democrats like crime'",
            "description": "Trump predicts crime will dominate 2026 midterm elections as he defends federalization efforts in Washington, D.C. involving National Guard deployment.",
            "keywords": "",
            "snippet": "NEW You can now listen to Fox News articles!\n\nPresident Donald Trump on Tuesday told reporters that he believes crime will be a major focus in the 2026 midterm ...",
            "url": "https://www.foxnews.com/politics/trump-claims-were-against-crime-democrats-like-crime",
            "image_url": "https://static.foxnews.com/foxnews.com/content/uploads/2025/08/trump-oval-office-chicago-crackdown-82225.jpg",
            "language": "en",
            "published_at": "2025-08-26T22:42:47

Fetching page 413: 100%|██████████| 413/413 [07:43<00:00,  1.12s/it]


In [None]:
import os

# Every file written by this cell goes under output_dir, so changing it here
# is enough to redirect all outputs.
output_dir = 'data/news'
os.makedirs(output_dir, exist_ok=True)

# Save individual source files
for source_name, df in articles_by_source.items():
    df.to_csv(f'{output_dir}/all_articles_{source_name}.csv', index=False)

# Concatenate all source dataframes
news_df = pd.concat(list(articles_by_source.values()), ignore_index=True)

# Fail with a clear message instead of a bare KeyError when every fetch
# came back empty.
if 'published_at' not in news_df.columns:
    raise ValueError("No articles with a 'published_at' column were fetched; nothing to combine.")

# Create date column in yyyy/mm format from published_at
news_df['date'] = pd.to_datetime(news_df['published_at']).dt.strftime('%Y/%m')

# Sort by month first, then by publication time, so the row order inside each
# month is well defined. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run.
news_df = news_df.sort_values(by=['date', 'published_at']).reset_index(drop=True)
news_df.to_csv(f'{output_dir}/all_articles_combined.csv', index=False)

NameError: name 'fox_df' is not defined

{
    "meta": {
        "found": 0,
        "returned": 0,
        "limit": 1,
        "page": 1
    },
    "data": []
}
