In [None]:
import os
import json
import time
import textwrap
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

#  Hugging Face Model 
# One checkpoint name drives both the weights and the tokenizer; everything is
# loaded once here and reused by every later call to ner_pipeline.
model_name = "Jean-Baptiste/roberta-large-ner-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# extract_people reads the "entity_group" and "word" fields of each result
# returned by this pipeline.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

#  Utility: Chunk text to avoid model length overflow 
def chunk_text(text, max_length=400):
    """Split ``text`` into a list of pieces no longer than ``max_length``.

    The split is delegated to ``textwrap`` with its default options, so the
    limit is measured in characters (not model tokens) and breaks fall on
    whitespace where possible.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece.

    Returns:
        A list of strings; empty when ``text`` is empty.
    """
    wrapper = textwrap.TextWrapper(width=max_length)
    return wrapper.wrap(text)

def extract_people(text):
    """Count person-name mentions in ``text`` using the module-level NER pipeline.

    The text is split with ``chunk_text`` and each chunk is passed to
    ``ner_pipeline``. Every returned entity whose ``entity_group`` is ``"PER"``
    adds one to the tally for its whitespace-stripped ``word``.

    Args:
        text: Plain text to scan.

    Returns:
        A plain ``dict`` mapping each detected name to the number of times it
        was tagged, in order of first appearance. Empty when nothing is found.

    Errors raised while handling a chunk are printed and the rest of that chunk
    is skipped, so one bad chunk does not discard the whole document.
    """
    mention_counts = defaultdict(int)
    for chunk in chunk_text(text, max_length=400):
        try:
            for ent in ner_pipeline(chunk):
                if ent["entity_group"] != "PER":
                    continue
                name = ent["word"].strip()
                # An entity that is only whitespace would otherwise be recorded
                # under the key "" and surface as a nameless "person".
                if not name:
                    continue
                mention_counts[name] += 1
        except Exception as e:
            # Deliberately best-effort: report and move on to the next chunk.
            print(f" NER error: {e}")
    return dict(mention_counts)

#  Core function: extract from saved HTML file 
def process_html_file(filepath):
    """Return the paragraph text of a saved HTML page as a single string.

    ``<p>`` elements with class ``paywall`` are used when any exist; otherwise
    every ``<p>`` in the document is used.

    Args:
        filepath: Path to an HTML file, read as UTF-8.

    Returns:
        The paragraph texts joined by single spaces, with every run of
        whitespace collapsed to one space and no leading/trailing whitespace.
        ``""`` when the page contains no ``<p>`` elements.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Fall back to all paragraphs when no "paywall" paragraphs are present.
    paragraphs = soup.find_all("p", class_="paywall") or soup.find_all("p")

    # get_text(strip=True) strips each text fragment and then concatenates the
    # fragments with an empty separator, so "by <a>Jane Doe</a> of" came out as
    # "byJane Doeof" and names were fused with neighbouring words before NER.
    # Take the untouched text instead and normalise whitespace afterwards.
    raw_text = " ".join(p.get_text() for p in paragraphs)
    return " ".join(raw_text.split())

#  Main pipeline 
if __name__ == "__main__":
    HTML_DIR = "/content/drive/MyDrive/OIA_NS/NS_rendered_html/"  # ← put your folder here
    OUTPUT_JSON = "/content/drive/MyDrive/OIA_NS/NS_people_mentions_ner.json"

    results = []

    # Pick the HTML files up front so the progress bar total counts only files
    # that are actually processed. Sorting makes the output order deterministic
    # (plain lexicographic order of the filenames).
    html_files = sorted(name for name in os.listdir(HTML_DIR) if name.endswith(".html"))

    for filename in tqdm(html_files, desc="Processing HTMLs", ncols=100):
        try:
            html_path = os.path.join(HTML_DIR, filename)
            text = process_html_file(html_path)

            mention_counts = extract_people(text)

            results.append({
                # Placeholder title built from the filename stem. splitext drops
                # only the trailing extension, whereas str.replace would also
                # delete a ".html" that happens to occur inside the name.
                "title": os.path.splitext(filename)[0].replace("_", " ").title(),
                "author": "Unknown",
                "url": "Unknown",
                "mentioned_people": list(mention_counts.keys()),
                "mention_counts": mention_counts,
                # NOTE(review): the label is "Wired" although the paths above
                # are under OIA_NS — confirm this is the intended value.
                "source": "Wired"
            })
        except Exception as e:
            # Best-effort: a file that fails is reported and left out, so the
            # output may hold fewer records than there are HTML files. Anything
            # that later pairs these records by position must account for that.
            print(f" Failed on {filename}: {e}")

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n Done! Processed {len(results)} articles → {OUTPUT_JSON}")


In [None]:
import json

# Input and output locations
NER_RESULTS_FILE = "/content/drive/MyDrive/OIA_NS/NS_people_mentions_ner.json"
ORIGINAL_ARTICLES_FILE = "/content/drive/MyDrive/OIA_NS/newscientist_physics_articles_1400.json"
OUTPUT_FILE = "/content/drive/MyDrive/OIA_NS/NS_people_mentions_final.json"

# Read the NER records and the article metadata
with open(NER_RESULTS_FILE, "r", encoding="utf-8") as f:
    ner_data = json.load(f)

with open(ORIGINAL_ARTICLES_FILE, "r", encoding="utf-8") as f:
    original_articles = json.load(f)

# The merge below pairs records purely by position, so the two lists must at
# least have the same length.
# NOTE(review): equal length does not prove the two files are in the same
# order — confirm that both were produced in matching order.
assert len(ner_data) == len(original_articles), "Mismatch in number of entries!"

# Overwrite the placeholder metadata on each NER record with the real values
# from the article at the same position. Records are updated in place, so
# merged_data holds the same dict objects as ner_data.
merged_data = []
for ner_record, article in zip(ner_data, original_articles):
    for field in ("title", "author", "url"):
        ner_record[field] = article[field]
    merged_data.append(ner_record)

# Write the merged records
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2, ensure_ascii=False)

print(f"Done! Merged metadata into all {len(merged_data)} records → {OUTPUT_FILE}")
