In [13]:
import math
from datetime import date
import webbrowser

# Configuration variables
g_keywords = ["chinese", "chinaman", "chink"]
g_search_date_range_start = date(1850, 1, 1)
g_search_date_range_end = date(2024, 7, 9)
g_num_articles_per_time_period = 50
g_time_block_range_years = 5

g_search_date_range_days = (g_search_date_range_end - g_search_date_range_start).days
g_num_time_blocks = math.ceil(g_search_date_range_days / (365 * g_time_block_range_years))

def get_time_block_number(year, search_date_range_start, time_block_range_years):
    return math.floor((year - search_date_range_start.year) / time_block_range_years)

def get_starting_date(time_block_num, search_date_range_start, time_block_range_years):
    return date(search_date_range_start.year + time_block_num * time_block_range_years, 1, 1)

def get_ending_date(time_block_num, search_date_range_start, search_date_range_end, time_block_range_years):
    ending_date = date(search_date_range_start.year + (time_block_num + 1) * time_block_range_years - 1, 12, 31)
    if ending_date > search_date_range_end:
        return search_date_range_end
    else:
        return ending_date

def get_keywords_with_num_hits(keywords, start_date, end_date):
    keywords_with_num_hits = {}
    for keyword in keywords:
        webbrowser.open("https://www.newspapers.com/search/results/?country=us&date-end=" + end_date.strftime('%Y-%m-%d') + "&date-start=" + start_date.strftime('%Y-%m-%d') + "&entity-types=page&keyword=" + keyword)
        keywords_with_num_hits[keyword] = int(input("Number of hits: ").replace(",", ""))
    return keywords_with_num_hits

def get_keywords_with_num_articles(keywords_with_num_hits, num_articles_per_time_period):
    total_hits = sum(keywords_with_num_hits.values())
    article_remainders = []
    keywords_with_num_articles = {}
    for keyword, hits in keywords_with_num_hits.items():
        keywords_with_num_articles[keyword] = int((hits / total_hits) * num_articles_per_time_period)
        article_remainders.append((keyword, ((hits / total_hits) * num_articles_per_time_period) % 1))
    article_remainders.sort(key=lambda keyword_with_article_remainder: keyword_with_article_remainder[1], reverse=True)
    num_searches_short = num_articles_per_time_period - sum(keywords_with_num_articles.values())
    for i in range(num_searches_short):
        keywords_with_num_articles[article_remainders[i][0]] += 1
    return keywords_with_num_articles

def get_keywords_with_urls_and_num_articles(keywords_with_num_articles, start_date, end_date):
    keywords_with_urls_and_num_articles = []
    for keyword, num_articles in keywords_with_num_articles.items():
        keywords_with_urls_and_num_articles.append({
            "keyword": keyword,
            "url": "https://www.newspapers.com/search/results/?country=us&date-end=" + end_date.strftime('%Y-%m-%d') + "&date-start=" + start_date.strftime('%Y-%m-%d') + "&entity-types=page&keyword=" + keyword,
            "num_articles": num_articles
        })
    return keywords_with_urls_and_num_articles

print("Number of time blocks: " + str(g_num_time_blocks))
print("Number of articles to download: " + str(g_num_time_blocks * g_num_articles_per_time_period))

g_time_block_number = int(input("Pick time block, 0-" + str(g_num_time_blocks - 1) + ": "))
if g_time_block_number < 0 or g_time_block_number >= g_num_time_blocks:
    print("Invalid time block!")
    exit()

g_start_date = get_starting_date(g_time_block_number, g_search_date_range_start, g_time_block_range_years)
g_end_date = get_ending_date(g_time_block_number, g_search_date_range_start, g_search_date_range_end, g_time_block_range_years)

print("Start date: " + g_start_date.strftime('%Y-%m-%d'))
print("End date: " + g_end_date.strftime('%Y-%m-%d'))
print("Length of time period in days: " + str((g_end_date - g_start_date).days))

Number of time blocks: 35
Number of articles to download: 1750
Start date: 1850-01-01
End date: 1854-12-31
Length of time period in days: 1825


Get the number of articles needed for each keyword by pasting in the number of hits for each keyword

In [14]:
g_keywords_with_num_articles = get_keywords_with_num_articles(get_keywords_with_num_hits(g_keywords, g_start_date, g_end_date), g_num_articles_per_time_period)
print(g_keywords_with_num_articles)

ValueError: invalid literal for int() with base 10: ''

Generate the URLs to grab the articles from

In [11]:
g_keywords_with_urls_and_num_articles = get_keywords_with_urls_and_num_articles(g_keywords_with_num_articles, g_start_date, g_end_date)
print("Generated " + str(len(g_keywords_with_urls_and_num_articles)) + " URLs")

Generated 3 URLs


Utility to guide users in downloading articles

In [12]:
for keyword_with_urls_and_num_articles in g_keywords_with_urls_and_num_articles:
    text_input = input("Please download " + str(keyword_with_urls_and_num_articles["num_articles"]) + " articles with the keyword \"" + keyword_with_urls_and_num_articles["keyword"] + "\". Press return to continue. Type \"stop\" to stop.")
    if text_input == "stop":
        break
    webbrowser.open(keyword_with_urls_and_num_articles["url"])
    text_input = input("Press return to continue. Type \"stop\" to stop.")
    if text_input == "stop":
        break