In [4]:
# !pip install spacy tqdm
# !python -m spacy download en_core_web_trf

In [6]:
import re

def clean_mentioned_people(raw_names):
    """Clean a list of raw PERSON strings and return the unique names in order.

    Each name goes through the following steps:

    1. A leading dialogue verb (said/explained/wrote/noted/added, any case)
       is removed, but only when it is followed by a capitalised word
       (``"saidBradley"``, ``"said Bradley"``) or by nothing at all. Names
       that merely start with the same letters (``"Saida Khan"``) are left
       intact instead of being truncated.
    2. Surrounding whitespace is stripped.
    3. Names shorter than 3 characters are dropped.
    4. Names on the skip list (compared case-insensitively) are dropped.

    Known limitation: a genuine given name equal to one of the verbs and
    followed by a surname (e.g. ``"Said Ali"``) is still reduced to the surname.

    Args:
        raw_names: Iterable of strings, e.g. entity texts produced by NER.

    Returns:
        list[str]: Cleaned names, de-duplicated (case-sensitively) with the
        order of first appearance preserved.
    """
    verb_prefix = re.compile(r"^(said|explained|wrote|noted|added)", flags=re.IGNORECASE)
    # Built once instead of re-creating a list on every iteration.
    skipped_names = {"quanta magazine", "mark belan", "samantha mash"}  # add more if needed

    cleaned = []

    for name in raw_names:
        # Remove common dialogue verbs (e.g., saidBradley)
        match = verb_prefix.match(name)
        if match:
            remainder = name[match.end():].lstrip()
            # Only treat the verb as an artifact when a new capitalised word
            # (or nothing) follows; otherwise it is part of the name itself.
            if not remainder or remainder[0].isupper():
                name = remainder

        # Strip whitespace and filter short/empty
        name = name.strip()
        if len(name) < 3:
            continue

        # Skip names on the exclusion list
        if name.lower() in skipped_names:
            continue

        cleaned.append(name)

    # Deduplicate and preserve order
    return list(dict.fromkeys(cleaned))


In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
import json
import time
from tqdm import tqdm
import urllib3

# scrape_article_content() below calls requests.get(..., verify=False);
# silence the InsecureRequestWarning category that urllib3 defines for
# unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Load spaCy transformer model (NER).
# `nlp` is a module-level name read by extract_people_with_spacy().
nlp = spacy.load("en_core_web_trf")  # Or use "en_core_web_sm" for faster, smaller model

# Scrape article content
def scrape_article_content(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page to fetch.

    Returns:
        str: One line per ``<p>`` element, with runs of whitespace inside a
        paragraph collapsed to single spaces. An empty string is returned
        (after printing the error) when the request fails, the server
        answers with a 4xx/5xx status, or parsing raises.
    """
    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # It is kept because the notebook deliberately suppresses the matching
        # urllib3 warning; re-enable verification if the environment allows it.
        response = requests.get(url, verify=False, timeout=15)
        # Treat HTTP error statuses as failures so error pages are not
        # returned as article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        # get_text(strip=True) strips every text node and joins them with no
        # separator, so '... said <a>Bradley</a>' turns into 'saidBradley'.
        # Keep the document's own whitespace and collapse it instead.
        return "\n".join(" ".join(p.get_text().split()) for p in paragraphs)
    except Exception as e:
        # Deliberately broad: one bad page should not abort a long batch run.
        print(f" Failed to scrape {url}: {e}")
        return ""

# Run spaCy NER to extract people
from collections import Counter

def extract_people_with_spacy(article):
    """Run NER on an article and summarise the PERSON entities it mentions.

    Cleaning is delegated to ``clean_mentioned_people`` so that the
    dialogue-verb list, the minimum length and the skip list are defined in
    one place only (the previous inline copy had drifted and no longer
    handled the verb "wrote").

    Args:
        article: Mapping with the keys ``"content"``, ``"author"``,
            ``"title"`` and ``"url"``.

    Returns:
        dict: ``author``, ``title`` and ``url`` copied from ``article``;
        ``mentioned_people`` is the list of distinct cleaned names in order
        of first appearance; ``mention_counts`` maps each of those names to
        the number of times it was recognised.
    """
    doc = nlp(article["content"])
    raw_people = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]

    # clean_mentioned_people() de-duplicates its output, which would lose the
    # counts, so every mention is cleaned on its own: the helper returns
    # either [] (mention rejected) or a one-element list (cleaned name).
    kept_mentions = [
        cleaned
        for raw in raw_people
        for cleaned in clean_mentioned_people([raw])
    ]

    # Count occurrences
    name_counts = Counter(kept_mentions)

    return {
        "author": article["author"],
        "title": article["title"],
        "url": article["url"],
        "mentioned_people": list(name_counts.keys()),
        "mention_counts": dict(name_counts)
    }

# def extract_people_with_spacy(article):
#     doc = nlp(article["content"])
#     raw_people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
#     cleaned_people = clean_mentioned_people(raw_people)
#     return {
#         "author": article["author"],
#         "title": article["title"],
#         "url": article["url"],
#         "mentioned_people": cleaned_people
#     }

# Load article metadata (filtered to top authors)
with open("quanta_top20_articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# You can change this to run only a few for demo:
articles_to_process = articles

# Run extraction
results = []

for article in tqdm(articles_to_process, desc="Processing with spaCy NER"):
    content = scrape_article_content(article["url"])
    # Articles whose scrape failed or produced no text are skipped.
    if content.strip():
        # The scraped text is stored on the article dict itself, because
        # extract_people_with_spacy() reads it from article["content"].
        article["content"] = content
        extracted = extract_people_with_spacy(article)
        results.append(extracted)
    # Pause after every request, including failed ones, so a run of failures
    # does not turn into back-to-back requests against the server.
    time.sleep(1)

# Save output
with open("quanta_ner_people.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("Extraction complete! Results saved to quanta_ner_people.json.")


In [1]:
import spacy

# Load transformer-based spaCy NER model
# NOTE(review): the extraction cell also assigns `nlp` from the same
# "en_core_web_trf" model, so this cell looks redundant — confirm before
# removing it.
nlp = spacy.load("en_core_web_trf")

  from .autonotebook import tqdm as notebook_tqdm
