In [28]:
from datetime import date
import math

# Configuration variables
g_keywords = ["chinese", "japanese", "korean", "okinawan", "taiwanese", "tibetan", "\"east+asian\"", "oriental", "chinaman", "chinamen", "jap", "chink", "coolie", "celestial"]
g_search_date_range_start = date(1850, 1, 1)
g_search_date_range_end = date(2024, 7, 9)
g_num_articles_per_time_period = 20
g_time_block_range_years = 5

# Total length of the search range in days. Kept for reference; it is no longer
# used to derive the number of time blocks.
g_search_date_range_days = (g_search_date_range_end - g_search_date_range_start).days

# Time blocks are aligned to calendar years: block k starts on 1 January of
# (start year + k * g_time_block_range_years) -- see get_starting_date.
# Counting blocks from the year difference keeps the count consistent with
# those boundaries. Dividing the day count by 365 * years ignores leap days and
# can report one block too many (e.g. 1850-01-01..1859-12-31 is 3651 days,
# which rounds up to 3 blocks although only 2 exist).
g_num_time_blocks = (g_search_date_range_end.year - g_search_date_range_start.year) // g_time_block_range_years + 1

print("Number of time blocks: " + str(g_num_time_blocks))
print("Number of articles to download: " + str(g_num_time_blocks * g_num_articles_per_time_period))

# Ask which time block to work on. An out-of-range answer raises ValueError
# (the same exception int() raises for non-numeric input) instead of calling
# exit(): inside a notebook exit() asks the kernel to shut down, which would
# throw away all state, whereas an exception just stops the cell / "Run All".
g_time_block_number = int(input("Pick time block, 0-" + str(g_num_time_blocks - 1) + ": "))
if not 0 <= g_time_block_number < g_num_time_blocks:
    raise ValueError(
        "Invalid time block! Expected a number from 0 to "
        + str(g_num_time_blocks - 1)
        + ", got "
        + str(g_time_block_number)
        + "."
    )


def get_starting_date(time_block_num, search_date_range_start, time_block_range_years):
    """Return 1 January of the first year covered by the given time block.

    Only the year of ``search_date_range_start`` is used; its month and day
    are ignored.
    """
    years_offset = time_block_num * time_block_range_years
    first_year = search_date_range_start.year + years_offset
    return date(first_year, 1, 1)


def get_ending_date(time_block_num, search_date_range_start, search_date_range_end, time_block_range_years):
    """Return the last day covered by the given time block.

    A block nominally ends on 31 December of its final year; the result is
    capped at ``search_date_range_end`` so it never runs past the search range.
    """
    last_year = search_date_range_start.year + (time_block_num + 1) * time_block_range_years - 1
    block_end = date(last_year, 12, 31)
    if block_end <= search_date_range_end:
        return block_end
    return search_date_range_end


# Resolve the first and last day of the selected time block.
g_start_date = get_starting_date(
    g_time_block_number,
    g_search_date_range_start,
    g_time_block_range_years,
)
g_end_date = get_ending_date(
    g_time_block_number,
    g_search_date_range_start,
    g_search_date_range_end,
    g_time_block_range_years,
)

# Echo the resolved period so the choice can be checked before downloading.
print("Start date: ", g_start_date.strftime('%Y-%m-%d'), sep="")
print("End date: ", g_end_date.strftime('%Y-%m-%d'), sep="")
print("Length of time period in days: ", (g_end_date - g_start_date).days, sep="")

Number of time blocks: 35
Number of articles to download: 700
Start date: 1850-01-01
End date: 1854-12-31
Length of time period in days: 1825


Load functions

In [41]:
import random
import webbrowser
from pathlib import Path
from PIL import Image
from pytesseract import pytesseract
import json
import cv2


def parse_time_block_str(time_blocks_str):
    """Parse a user-entered list of time blocks into a list of ints.

    The input is a comma-separated list whose items are either a single
    number (``"3"``) or an inclusive range (``"10-15"``). Whitespace around
    commas and items is optional, and empty items (for example from a
    trailing comma or an empty string) are ignored.

    Args:
        time_blocks_str: Text such as ``"1, 2, 3"``, ``"10-15"`` or ``"0,3-4"``.

    Returns:
        The time block numbers in the order given, with ranges expanded.

    Raises:
        ValueError: If an item is not a number or a ``start-end`` range.
    """
    time_block_list = []
    # Split on the bare comma and strip each item, so "1,2" and "1, 2" both work.
    for raw_item in time_blocks_str.split(","):
        time_block_str = raw_item.strip()
        if not time_block_str:
            continue
        if "-" in time_block_str:
            time_block_range = time_block_str.split("-")
            first_block = int(time_block_range[0])
            last_block = int(time_block_range[1])
            # Ranges are inclusive on both ends.
            time_block_list.extend(range(first_block, last_block + 1))
        else:
            time_block_list.append(int(time_block_str))
    return time_block_list


def get_time_block_number(year, search_date_range_start, time_block_range_years):
    """Return the index of the time block that contains ``year``.

    Block 0 begins in the year of ``search_date_range_start``; each block
    spans ``time_block_range_years`` years.
    """
    years_since_start = year - search_date_range_start.year
    blocks_elapsed = years_since_start / time_block_range_years
    return math.floor(blocks_elapsed)


def get_keywords_with_num_hits(keywords, start_date, end_date):
    """Collect the number of search hits for each keyword from the user.

    For every keyword the matching newspapers.com search (restricted to the
    given date range) is opened in the web browser, and the user is asked to
    type in the hit count shown there. Thousands separators (commas) in the
    typed number are accepted.

    Returns:
        A dict mapping each keyword to its hit count as an int.
    """
    hits_by_keyword = {}
    for keyword in keywords:
        search_url = (
            "https://www.newspapers.com/search/results/?country=us&date-end="
            + end_date.strftime('%Y-%m-%d')
            + "&date-start="
            + start_date.strftime('%Y-%m-%d')
            + "&entity-types=page&keyword="
            + keyword
        )
        webbrowser.open(search_url)
        typed_hits = input("Number of hits: ")
        hits_by_keyword[keyword] = int(typed_hits.replace(",", ""))
    return hits_by_keyword


def get_keywords_with_num_articles(keywords_with_num_hits, num_articles_per_time_period):
    """Split the article budget across keywords in proportion to their hits.

    Uses the largest-remainder method: every keyword first receives the whole
    number of articles in its proportional share, then the articles still
    missing from the budget go, one each, to the keywords with the largest
    fractional remainders. Ties keep the input order of the keywords.

    Args:
        keywords_with_num_hits: Dict mapping keyword -> number of search hits.
        num_articles_per_time_period: Total number of articles to distribute.

    Returns:
        Dict mapping keyword -> number of articles to download. The values sum
        to ``num_articles_per_time_period``, unless there are no hits at all,
        in which case every keyword gets 0.
    """
    total_hits = sum(keywords_with_num_hits.values())
    if total_hits == 0:
        # Nothing was found for any keyword (or no keywords were given), so
        # there is nothing to download and no proportion to compute.
        return {keyword: 0 for keyword in keywords_with_num_hits}

    keywords_with_num_articles = {}
    article_remainders = []
    for keyword, hits in keywords_with_num_hits.items():
        # Integer divmod keeps the share and its remainder exact; going through
        # float division can misplace a share that is a whole number.
        whole_articles, remainder = divmod(hits * num_articles_per_time_period, total_hits)
        keywords_with_num_articles[keyword] = int(whole_articles)
        article_remainders.append((keyword, remainder))

    # Stable sort: keywords with equal remainders stay in input order.
    article_remainders.sort(key=lambda keyword_with_article_remainder: keyword_with_article_remainder[1], reverse=True)
    num_searches_short = num_articles_per_time_period - sum(keywords_with_num_articles.values())
    for keyword, _ in article_remainders[:max(num_searches_short, 0)]:
        keywords_with_num_articles[keyword] += 1
    return keywords_with_num_articles


def get_keywords_with_urls_and_num_articles(keywords_with_num_articles, start_date, end_date):
    """Build one search entry per keyword.

    Returns:
        A list of dicts, one per keyword in input order, each holding the
        ``"keyword"``, the newspapers.com search ``"url"`` for that keyword
        within the date range, and the ``"num_articles"`` to download.
    """
    return [
        {
            "keyword": keyword,
            "url": "https://www.newspapers.com/search/results/?country=us&date-end=" + end_date.strftime('%Y-%m-%d') + "&date-start=" + start_date.strftime('%Y-%m-%d') + "&entity-types=page&keyword=" + keyword,
            "num_articles": num_articles,
        }
        for keyword, num_articles in keywords_with_num_articles.items()
    ]


def run_article_download_helper(keywords_with_urls_and_num_articles):
    """Walk the user through downloading the articles for each keyword.

    Keywords that need zero articles are skipped. For every other keyword the
    user is prompted, the keyword's search URL is opened in the browser, and
    the user is prompted again once the downloads are done. Typing "stop" at
    either prompt ends the whole run.
    """
    for entry in keywords_with_urls_and_num_articles:
        if entry["num_articles"] == 0:
            continue

        before_prompt = "Please download " + str(entry["num_articles"]) + " articles with the keyword \"" + entry["keyword"] + "\". Press return to continue. Type \"stop\" to stop."
        if input(before_prompt) == "stop":
            break

        webbrowser.open(entry["url"])

        after_prompt = "Once you have downloaded " + str(entry["num_articles"]) + " articles with the keyword \"" + entry["keyword"] + "\", press return to continue. Type \"stop\" to stop."
        if input(after_prompt) == "stop":
            break



def get_info_from_file_stem(file_stem):
    """Extract the newspaper name and publication date from a file stem.

    The stem is expected to look like ``Newspaper_Name_YYYY_MM_DD[_suffix]``,
    e.g. ``The_Washington_Union_1852_04_22_1``. The date is the first run of
    three underscore-separated parts that form a valid calendar date with a
    4-digit year, so a 4-digit number inside the newspaper name is not
    mistaken for the year.

    Args:
        file_stem: File name without its extension.

    Returns:
        A ``(newspaper_name, date_published)`` tuple: the name parts before the
        date joined with spaces, and the date as a ``datetime.date``.

    Raises:
        ValueError: If the stem contains no ``YYYY_MM_DD`` date.
    """
    parts = file_stem.split('_')

    for year_index in range(len(parts) - 2):
        year_part, month_part, day_part = parts[year_index:year_index + 3]
        if not (year_part.isdigit() and len(year_part) == 4):
            continue
        try:
            date_published = date(int(year_part), int(month_part), int(day_part))
        except ValueError:
            # Not a real date (non-numeric or out-of-range month/day); keep
            # scanning in case the actual date appears later in the stem.
            continue
        newspaper_name = ' '.join(parts[:year_index])
        return newspaper_name, date_published

    raise ValueError("No YYYY_MM_DD date found in file stem: " + repr(file_stem))


def write_contexts_to_file(contexts, target_dir_path):
    """Write one JSON file per time period into ``target_dir_path``.

    Args:
        contexts: Dict mapping a time period to the JSON-serialisable data for
            that period. Each entry is written to ``<time_period>.json``.
        target_dir_path: Directory to write into (``pathlib.Path`` or ``str``).
            It is created, including missing parent directories, if needed.
    """
    target_dir_path = Path(target_dir_path)
    # parents/exist_ok make this safe for nested targets and for directories
    # that already exist, without a separate exists() check.
    target_dir_path.mkdir(parents=True, exist_ok=True)
    for time_period, contexts_in_time_period in contexts.items():
        with open(target_dir_path / f"{time_period}.json", 'w') as contexts_file:
            json.dump(contexts_in_time_period, contexts_file, indent=4)



def from_contexts_files_sentiment_analysis_to_files(time_block_numbers):
    """Run sentiment analysis on the stored contexts of each time block.

    For every time block number the file
    ``contexts/time_period_<n>_contexts.json`` is read (relative to the
    current working directory). It must map each keyword to a list of records
    that have a ``'context'`` text and a ``'date'``. Each context is passed
    through ``nlp_sentiment_analysis`` and its polarity and subjectivity are
    recorded next to the record's date.

    NOTE(review): despite the function name, nothing is written to disk yet;
    the results are returned instead of being discarded. The file name and the
    ``'context'`` key also differ from what write_contexts_to_file /
    get_all_contexts_for_keywords produce (``<n>.json`` and ``"text"``) --
    confirm which step is expected to create these input files.

    Args:
        time_block_numbers: Iterable of time block numbers to process.

    Returns:
        Dict mapping time block number -> {keyword: [{'date', 'polarity',
        'subjectivity'}, ...]}.
    """
    sentiment_by_time_block = {}
    for time_block_number in time_block_numbers:
        with open('contexts/time_period_' + str(time_block_number) + "_contexts.json") as contexts_file:
            contexts = json.load(contexts_file)

        contexts_with_sentiment = {}
        for keyword, date_and_contexts in contexts.items():
            contexts_with_sentiment[keyword] = []
            for date_and_context in date_and_contexts:
                doc = nlp_sentiment_analysis(date_and_context['context'])
                contexts_with_sentiment[keyword].append({
                    # Use the record's own date; the bare name `date` is the
                    # datetime.date class, not a value.
                    'date': date_and_context['date'],
                    'polarity': doc._.blob.polarity,
                    'subjectivity': doc._.blob.subjectivity
                })
        sentiment_by_time_block[time_block_number] = contexts_with_sentiment
    return sentiment_by_time_block

Get the number of articles needed for each keyword by pasting in the number of hits for each keyword

In [24]:
# Prompt for the hit count of every keyword in the selected period, then split
# the per-period article budget across the keywords in proportion to the hits.
g_keywords_with_num_articles = get_keywords_with_num_articles(
    keywords_with_num_hits=get_keywords_with_num_hits(g_keywords, g_start_date, g_end_date),
    num_articles_per_time_period=g_num_articles_per_time_period,
)
print(g_keywords_with_num_articles)

{'chinese': 9, 'japanese': 8, 'korean': 0, 'okinawan': 0, 'taiwanese': 0, 'tibetan': 0, '"east+asian"': 0, 'oriental': 0, 'chinaman': 2, 'chinamen': 1, 'jap': 0, 'chink': 0, 'coolie': 0, 'celestial': 0}


Generate the URLs to grab the articles from. Note: number of URLs is not necessarily equal to number of articles to download.

In [25]:
# One entry (keyword, search URL, article count) per keyword, including
# keywords that were allotted zero articles.
g_keywords_with_urls_and_num_articles = get_keywords_with_urls_and_num_articles(
    g_keywords_with_num_articles,
    g_start_date,
    g_end_date,
)
print("Generated ", len(g_keywords_with_urls_and_num_articles), " URLs", sep="")

Generated 14 URLs


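Utility to guide users in downloading articles. Save each article image in a folder named after the keyword, inside a folder named after the time block number, inside the "articles" folder (for example "articles/0/chinese/"). Name each file with the newspaper name followed by the publication date, with the parts separated by underscores, in the format "Newspaper_Name_YYYY_MM_DD" (for example "The_Washington_Union_1852_04_22_1.jpg").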

In [26]:
# Interactive step: for each keyword with a non-zero article count this prompts
# the user, opens the keyword's search URL in the browser, and waits for
# confirmation. Typing "stop" at any prompt ends the helper early.
run_article_download_helper(g_keywords_with_urls_and_num_articles)

Load NLP

In [37]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import numpy as np
import re

# Two separately loaded copies of the spaCy "en_core_web_trf" pipeline.
# nlp_sentence_segmentation is left as loaded.
# NOTE(review): nlp_sentence_segmentation is not referenced by any code visible
# in this notebook -- confirm it is still needed, since each load is costly.
nlp_sentence_segmentation = spacy.load("en_core_web_trf")
# nlp_sentiment_analysis additionally gets the "spacytextblob" component, which
# from_contexts_files_sentiment_analysis_to_files reads via doc._.blob.
# Presumably the SpacyTextBlob import above is what makes the "spacytextblob"
# factory name available to add_pipe -- verify against the spacytextblob docs.
nlp_sentiment_analysis = spacy.load("en_core_web_trf")
nlp_sentiment_analysis.add_pipe("spacytextblob")


def get_all_contexts_for_keywords(articles_dir_path, time_period_list):
    """OCR every downloaded article image and group the text by period and keyword.

    Expected layout: ``articles_dir_path/<time_period>/<keyword>/<image files>``.
    Each image is converted to grayscale, blurred, binarised with Otsu
    thresholding and passed to Tesseract. The OCR text is cleaned up: pipe
    characters are removed, words hyphenated across line breaks are re-joined,
    and all whitespace runs are collapsed to single spaces.

    Files that OpenCV cannot read as an image (for example ``.DS_Store``) are
    skipped with a printed notice. Directories and files are processed in
    sorted order so the output is reproducible.

    Args:
        articles_dir_path: ``pathlib.Path`` of the root "articles" directory.
        time_period_list: Time period numbers whose sub-directories to process.

    Returns:
        Dict mapping time period -> {keyword: [record, ...]}, where each record
        has the keys ``"newspaper"``, ``"date"`` (``YYYY-MM-DD``), ``"keyword"``
        and ``"text"``.
    """
    keywords_with_contexts_by_time_period = {}
    for time_period in time_period_list:
        contexts_by_keyword = {}
        keywords_with_contexts_by_time_period[time_period] = contexts_by_keyword
        time_period_dir_path = articles_dir_path / str(time_period)

        # Only the direct children are keyword folders. A recursive search here
        # would also treat nested folders as keywords and OCR their files twice.
        keyword_dir_paths = sorted(
            path for path in time_period_dir_path.glob("*") if path.is_dir()
        )
        for articles_by_keyword_dir_path in keyword_dir_paths:
            keyword = articles_by_keyword_dir_path.name
            contexts_by_keyword[keyword] = []

            article_paths = sorted(
                path for path in articles_by_keyword_dir_path.rglob("*") if path.is_file()
            )
            for article_path in article_paths:
                # Pass a str: not every OpenCV build accepts pathlib.Path here.
                img = cv2.imread(str(article_path), cv2.IMREAD_GRAYSCALE)
                if img is None:
                    # imread signals failure by returning None, not by raising.
                    print("Skipping file that could not be read as an image: " + str(article_path))
                    continue

                newspaper_name, date_published = get_info_from_file_stem(article_path.stem)

                blur = cv2.GaussianBlur(img, (5, 5), 0)
                th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
                text = pytesseract.image_to_string(th3)
                cleaned_text = re.sub(r'\s+', ' ', text.replace(" |", "").replace("|", "").replace("-\n", "").replace("\n", " "))
                contexts_by_keyword[keyword].append({
                    "newspaper": newspaper_name,
                    "date": date_published.strftime('%Y-%m-%d'),
                    "keyword": keyword,
                    "text": cleaned_text
                })
    return keywords_with_contexts_by_time_period

Get contexts surrounding keywords with OCR and sentence segmentation. Provide a list of numbers (e.g. 1, 2, 3) and/or a range (e.g. 10-15). The results are written to the "contexts" folder, one JSON file per time period.

In [42]:
# OCR the articles of the requested time periods (read from ./articles) and
# store the extracted text as JSON files under ./contexts.
g_articles_dir_path = Path.cwd() / 'articles'
write_contexts_to_file(
    get_all_contexts_for_keywords(
        g_articles_dir_path,
        parse_time_block_str(
            input("Enter time periods. Provide a list of numbers (e.g. 1, 2, 3) and/or a range (e.g. 10-15).")
        ),
    ),
    Path.cwd() / 'contexts',
)

python(25887) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25888) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25889) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25890) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25891) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25892) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25893) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25894) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25895) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25896) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25898) Malloc

Narrow down contexts to only one per article. Contexts are presented to the user in a random order, and the first one that the user confirms is relevant is used. If none are confirmed, the context is replaced with a placeholder. Results are stored in a JSON file that should be manually edited to fix placeholders before moving on to analysis.

In [None]:
# NOTE(review): this cell cannot run as written.
#  - narrow_contexts and g_contexts_for_keywords are not defined in any cell
#    visible in this notebook.
#  - write_contexts_to_file expects a directory path as its second argument
#    (it calls .exists(), .mkdir() and the "/" operator on it), but an int is
#    passed here.
# TODO: confirm the intended target directory and where narrow_contexts lives.
write_contexts_to_file(narrow_contexts(g_contexts_for_keywords), int(input("Pick time block number")))

Read in contexts from JSON file for sentiment analysis. The result is a dictionary that maps each keyword to a list of dictionaries with the keys "date", "polarity", and "subjectivity".

In [9]:
import cv2
import numpy as np

# Experimental cell: split a scanned newspaper page into sections by detecting
# long straight lines (column/article rules) and cutting along them.

# Step 1: Load the image
image_path = Path.cwd() / 'articles/0/indian/The_Washington_Union_1852_04_22_1.jpg'
# Pass a str (not every OpenCV build accepts pathlib.Path). imread does not
# raise on a missing or unreadable file; it returns None, so check explicitly.
image = cv2.imread(str(image_path))
if image is None:
    raise FileNotFoundError("Could not read image: " + str(image_path))

# Step 2: Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Step 3: Detect lines using Hough Transform
edges = cv2.Canny(gray, 50, 150, apertureSize=3)
lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)

# Initialize a mask with zeros (black)
mask = np.zeros_like(gray)

# Draw the lines on the mask. HoughLinesP returns None (not an empty array)
# when no line segment is found, so guard the loop.
if lines is not None:
    for line in lines:
        x1, y1, x2, y2 = line[0]
        cv2.line(mask, (x1, y1), (x2, y2), (255), thickness=5)

# Step 4: Create masks for each section (invert mask for segmenting)
mask = cv2.bitwise_not(mask)

# Step 5: Find contours and segment the image
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
section = None
for i, contour in enumerate(contours):
    x, y, w, h = cv2.boundingRect(contour)
    section = image[y:y+h, x:x+w]
    cv2.imwrite(f'section_{i}.jpg', section)  # Save each section as a separate image

# Optionally display the last section. Skipped when no contour was found, since
# there is then nothing to show.
# NOTE(review): cv2.imshow opens a native window and waitKey(0) blocks the
# cell until a key is pressed in that window -- confirm this is wanted in a
# notebook rather than an inline display.
if section is not None:
    cv2.imshow('Section', section)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
