In [None]:
!pip install -U gliner

In [None]:
from gliner import GLiNER

# Choose the device at runtime: hard-coding "cuda" raises on CPU-only runtimes.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GLiNER.from_pretrained("urchade/gliner_large-v2").to(device)


In [None]:
from collections import defaultdict

# Short quoted passage used as a smoke test for the GLiNER model loaded above.
text = """
 “Nothing is getting regulated right now,” says Orme-Zavaleta.
 “Reorganization takes a lot of time. They’re here for a four-year stint,
 and not much is going to happen other than rollbacks. That’s part of the strategy:
 Have people leave, and upend things so much that nothing is getting done.”
"""

# Request spans for the single label "person".
results = model.predict_entities(text, labels=["person"])

# Tally how often each person string was returned; any other label is skipped.
mention_counts = defaultdict(int)
for r in results:
    if r["label"] != "person":
        continue
    mention_counts[r["text"]] = mention_counts[r["text"]] + 1

print(mention_counts)


In [None]:
import json
import os
import time
from collections import defaultdict
from tqdm import tqdm
from gliner import GLiNER
import requests
from bs4 import BeautifulSoup

#  Load GLiNER model 
# NOTE(review): this variable is set after `gliner` has already been imported in
# this cell; if the Hugging Face hub library reads it at import time the setting
# has no effect here — confirm, and move it above the imports if so.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
model = GLiNER.from_pretrained("urchade/gliner_large-v2")

# The previous expression resolved to CUDA in both branches, so it failed on
# CPU-only runtimes. Use the GPU only when one is actually available.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

#  Extract people entities using GLiNER 
def extract_people_from_text(text):
    """Count the "person" entities the module-level GLiNER ``model`` finds in ``text``.

    Args:
        text: Raw text handed to ``model.predict_entities``.

    Returns:
        A plain ``dict`` mapping each whitespace-stripped entity string to the
        number of times it was returned. Any exception raised during
        prediction or counting is printed rather than propagated, and the
        counts gathered up to that point are returned (possibly ``{}``).
    """
    tally = defaultdict(int)
    try:
        predictions = model.predict_entities(text, labels=["person"])
        for prediction in predictions:
            name = prediction["text"].strip()
            tally[name] += 1
    except Exception as e:
        # Best effort: report the failure and fall through to the return.
        print(f"⚠️ NER extraction failed: {e}")
    return dict(tally)

#  Process each article 
def extract_article_people(article, max_retries=3):
    """Download one article and return its metadata plus the people it mentions.

    Args:
        article: Mapping with at least the keys "url", "author" and "title".
        max_retries: Maximum number of download/parse attempts.

    Returns:
        A dict with the keys "author", "title", "url", "mentioned_people"
        (list of names), "mention_counts" (name -> count) and "source", or
        ``None`` when every attempt failed.

    Notes:
        * TLS certificate verification is left at the ``requests`` default
          (enabled); the earlier ``verify=False`` turned it off for every
          request.
        * HTTP error responses (4xx/5xx) now count as failed attempts and are
          retried, instead of having their error page parsed as the article.
    """
    for attempt in range(max_retries):
        try:
            res = requests.get(article["url"], timeout=15)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")
            # Join the text of every <p> element into one string for NER.
            text = " ".join(p.get_text(separator=" ") for p in soup.find_all("p"))
            mention_counts = extract_people_from_text(text)
            return {
                "author": article["author"],
                "title": article["title"],
                "url": article["url"],
                "mentioned_people": list(mention_counts.keys()),
                "mention_counts": mention_counts,
                "source": "Wired"
            }
        except Exception as e:
            print(f"Error fetching {article['url']}: {e} (attempt {attempt+1}/{max_retries})")
            # Back off only when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return None

#  Main pipeline 
if __name__ == "__main__":
    # Both paths are named once so the closing message always reports the file
    # that was actually written (it previously named a different file).
    INPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_science_articles_3.json"
    OUTPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_gliner_3.json"

    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Keep only the articles that could be fetched and processed.
    results = []
    for article in tqdm(articles, desc="Processing articles with GLiNER", ncols=100):
        result = extract_article_people(article)
        if result:
            results.append(result)
        time.sleep(0.5)  # polite delay

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n Done! Saved {len(results)} enriched articles to '{OUTPUT_JSON}'.")


In [None]:
# Chunked variant: split each article into chunks before running NER

In [None]:
import json
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from gliner import GLiNER
from tqdm import tqdm

#  Load GLiNER model 
model = GLiNER.from_pretrained("urchade/gliner_large-v2")

# The previous expression kept the model on whatever device it was already on,
# so the GPU was never selected. Move it to CUDA when a GPU is available.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

#  Helper: Chunk long text into sentences under token limit 
def chunk_text(text, max_words=384):
    """Split ``text`` into sentence-aligned chunks of at most ``max_words`` words.

    Sentences are detected by splitting on whitespace that follows ``.``,
    ``!`` or ``?``. Consecutive sentences are packed into one chunk while the
    running word count stays below ``max_words``.

    Compared with the earlier version this one:
      * never emits empty chunks (blank input returns ``[]``; previously an
        over-long first sentence produced a leading ``""`` chunk);
      * splits a single sentence longer than ``max_words`` into word windows
        instead of passing it through over-sized.

    Args:
        text: Text to split.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        List of non-empty chunk strings, in document order.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    import re

    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    chunks = []
    pending = []       # sentences collected for the chunk being built
    pending_words = 0  # total words across ``pending``

    def flush():
        nonlocal pending, pending_words
        if pending:
            chunks.append(" ".join(pending).strip())
        pending, pending_words = [], 0

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        words = sentence.split()
        if not words:
            # Blank fragment (e.g. trailing whitespace): nothing to add.
            continue

        if len(words) > max_words:
            # One sentence alone exceeds the limit: emit it as word windows.
            flush()
            for start in range(0, len(words), max_words):
                chunks.append(" ".join(words[start:start + max_words]))
            continue

        if pending_words + len(words) < max_words:
            pending.append(sentence)
            pending_words += len(words)
        else:
            flush()
            pending = [sentence]
            pending_words = len(words)

    flush()
    return chunks

#  Helper: Process a single article 
def process_article(article, source="New Scientist"):
    """Fetch one article, run chunked GLiNER person extraction, and summarise it.

    Args:
        article: Mapping with at least the keys "url", "title" and "author".
        source: Label stored under "source" in the returned record. It defaults
            to the previously hard-coded value, so existing callers are
            unaffected.

    Returns:
        A dict with the keys "title", "author", "url", "mentioned_people",
        "mention_counts" and "source", or ``None`` if anything failed; the
        failure is printed.

    Notes:
        * TLS certificate verification is left at the ``requests`` default
          (enabled); the earlier ``verify=False`` turned it off.
        * HTTP error responses now count as failures instead of having their
          error page parsed as an article.
    """
    try:
        res = requests.get(article["url"], timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        # Only <p class="paywall"> elements are used as article text.
        paywall_paragraphs = soup.find_all("p", class_="paywall")
        text = " ".join(p.get_text(" ", strip=True) for p in paywall_paragraphs)

        # Run the model chunk by chunk so long articles are not truncated.
        mention_counts = defaultdict(int)
        for chunk in chunk_text(text):
            if not chunk:
                # Nothing to analyse in an empty chunk.
                continue
            results = model.predict_entities(chunk, labels=["person"])
            for ent in results:
                if ent["label"].lower() == "person":
                    mention_counts[ent["text"].strip()] += 1  # mention count

        return {
            "title": article["title"],
            "author": article["author"],
            "url": article["url"],
            "mentioned_people": list(mention_counts.keys()),
            "mention_counts": dict(mention_counts),
            "source": source
        }

    except Exception as e:
        print(f"Failed: {article['url']} — {e}")
        return None


#  Main script 
if __name__ == "__main__":
    # Both paths are named once so the closing message always reports the file
    # that was actually written (it previously named an unrelated "NS_..." file).
    INPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_science_articles_3.json"
    OUTPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_gliner_311.json"

    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # NOTE(review): process_article labels its records "New Scientist" by
    # default, while the input here is the Wired article list — confirm the
    # intended "source" value.
    enriched = []
    for art in tqdm(articles, desc="Processing articles with GLiNER"):
        result = process_article(art)
        # Articles that failed or mention nobody are left out of the output.
        if result and result["mentioned_people"]:
            enriched.append(result)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(enriched, f, indent=2, ensure_ascii=False)

    print(f"\n Done! Saved {len(enriched)} enriched articles to '{OUTPUT_JSON}'")


In [None]:
# Sample passage (same quote as the earlier GLiNER smoke test); the next cell
# reads it through the module-level name `text`.
text = """
 “Nothing is getting regulated right now,” says Orme-Zavaleta.
 “Reorganization takes a lot of time. They’re here for a four-year stint,
 and not much is going to happen other than rollbacks. That’s part of the strategy:
 Have people leave, and upend things so much that nothing is getting done.”
"""

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Build a Hugging Face "ner" pipeline from the dslim/bert-base-NER checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# NOTE: this rebinds the module-level name `model`, which earlier cells use for
# the GLiNER model; re-run the GLiNER loading cell before returning to them.
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# `text` is the sample passage assigned in the preceding cell.
example = text

# Run the pipeline on the sample and print its raw output for inspection.
ner_results = nlp(example)
print(ner_results)


In [None]:
import langchain
from transformers import pipeline

def extract_names(document):
    """Return candidate name tokens from ``document`` using a placeholder heuristic.

    The document is split into sentences on ``.`` and every
    whitespace-separated token longer than five characters is kept. This is a
    demonstration filter only; it does not decide whether a token is a name.

    The earlier version called ``langchain.LangChain()`` and ``embed_text``,
    which the ``langchain`` package does not provide, so it raised before
    returning anything. The embeddings never influenced which tokens were
    kept, so that call is removed.

    Args:
        document: Text to scan.

    Returns:
        List of tokens in document order (duplicates preserved).
    """
    # Tokenize the document into sentences
    sentences = document.split('.')

    names = []
    for sentence in sentences:
        # Placeholder rule: keep tokens longer than 5 characters.
        names.extend(token for token in sentence.split() if len(token) > 5)

    return names

def extract_names_with_hugging_face_pipelines(document):
    """Filter the candidates from ``extract_names`` down to person tokens.

    Each candidate token is passed on its own through a Hugging Face "ner"
    pipeline and kept when the first returned entity is tagged ``B-PER``.

    The pipeline previously loaded ``distilbert-base-uncased``, a base
    checkpoint without a trained token-classification head, so it could never
    emit the ``B-PER`` tag and the function always returned ``[]``. It now
    loads ``dslim/bert-base-NER``, the NER checkpoint already used elsewhere
    in this notebook.

    Args:
        document: Text to scan.

    Returns:
        List of candidate tokens recognised as person entities.
    """
    # Candidate tokens from the heuristic pre-filter
    extracted_names = extract_names(document)

    # Hugging Face pipeline backed by a checkpoint fine-tuned for NER
    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER")

    names = []
    for name_token in extracted_names:
        # Apply NER to the single token
        entities = ner_pipeline(name_token)

        if entities and entities[0]['entity'] == 'B-PER':  # Checking if it's a person entity
            names.append(name_token)

    return names

# Demo run: pass a two-sentence sample through the extraction function defined
# above and print the tokens it returns.
document = "John Smith and Sarah Johnson attended the conference. Emily Brown was the keynote speaker."
extracted_names = extract_names_with_hugging_face_pipelines(document)
print("Extracted Names:", extracted_names)

In [None]:
!pip install LangChain

In [None]:
import json
import time
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm
import textwrap
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load stronger NER model
# Checkpoint shared by the tokenizer and the token-classification model below.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: rebinds the module-level name `model` used by earlier cells.
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Built with aggregation_strategy="simple"; the code in this cell reads the
# `entity_group` and `word` fields of each result.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Utility: Chunk long text into smaller windows for token limit
def chunk_text(text, max_length=400):
    """Split ``text`` into chunks of about ``max_length`` characters on whitespace.

    Words are never cut: with ``textwrap``'s defaults a hyphenated or very
    long word could be broken across two chunks (e.g. "Orme-" / "Zavaleta"),
    so the NER model never saw the full name. A single word longer than
    ``max_length`` is therefore kept whole and may exceed the limit.

    Args:
        text: Text to split; runs of whitespace are collapsed by ``textwrap``.
        max_length: Target maximum number of characters per chunk.

    Returns:
        List of chunk strings (empty list for blank input).
    """
    return textwrap.wrap(
        text,
        max_length,
        break_long_words=False,
        break_on_hyphens=False,
    )

# Core NER logic
def extract_people(text):
    """Count "PER" entities found by the module-level ``ner_pipeline`` in ``text``.

    The text is split with ``chunk_text(text, max_length=400)`` and each chunk
    is processed separately. An exception raised while handling a chunk is
    printed and the remaining chunks are still processed.

    Args:
        text: Article text to analyse.

    Returns:
        A plain ``dict`` mapping each whitespace-stripped entity string to the
        number of times it was reported.
    """
    tally = defaultdict(int)
    for piece in chunk_text(text, max_length=400):
        try:
            for found in ner_pipeline(piece):
                if found["entity_group"] != "PER":
                    continue
                tally[found["word"].strip()] += 1
        except Exception as e:
            print(f"NER error in chunk: {e}")
    return dict(tally)

# Article processor
def extract_article_people(article, max_retries=3):
    """Download one article and return its metadata plus the people it mentions.

    Args:
        article: Mapping with at least the keys "url", "title" and "author".
        max_retries: Maximum number of download/parse attempts.

    Returns:
        A dict with the keys "title", "author", "url", "mentioned_people"
        (list of names), "mention_counts" (name -> count) and "source", or
        ``None`` when every attempt failed.

    Notes:
        * TLS certificate verification is left at the ``requests`` default
          (enabled); the earlier ``verify=False`` turned it off for every
          request.
        * HTTP error responses (4xx/5xx) now count as failed attempts and are
          retried, instead of having their error page parsed as the article.
    """
    for attempt in range(max_retries):
        try:
            res = requests.get(article["url"], timeout=15)
            res.raise_for_status()
            soup = BeautifulSoup(res.content, "html.parser")
            # Prefer <p class="paywall"> elements; fall back to every <p>.
            paragraphs = soup.find_all("p", class_="paywall") or soup.find_all("p")
            text = " ".join(p.get_text(separator=" ") for p in paragraphs)
            mention_counts = extract_people(text)
            return {
                "title": article["title"],
                "author": article["author"],
                "url": article["url"],
                "mentioned_people": list(mention_counts.keys()),
                "mention_counts": mention_counts,
                "source": "Wired"  # or "New Scientist"
            }
        except Exception as e:
            print(f" Failed: {article['url']} (Attempt {attempt + 1}) | {e}")
            # Back off only when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return None

#  Main Pipeline 
if __name__ == "__main__":
    # Article list to read and destination for the enriched records.
    INPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_science_articles_3.json"
    OUTPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_robust_3.json"

    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Process the articles one by one, pausing briefly after each of them.
    results = []
    progress = tqdm(articles, desc="Extracting NER", ncols=100)
    for article in progress:
        enriched = extract_article_people(article)
        if enriched:
            # Only articles that were fetched and parsed are kept.
            results += [enriched]
        time.sleep(0.5)  # polite delay

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n Done! Extracted {len(results)} articles -> {OUTPUT_JSON}")


In [1]:
# Variant: run NER on locally saved HTML files instead of fetching URLs

In [None]:
import os
import json
import time
import textwrap
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

#  Hugging Face Model 
# Same checkpoint and pipeline settings as the earlier URL-based RoBERTa cell.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: rebinds the module-level name `model` used by earlier cells.
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Built with aggregation_strategy="simple"; extract_people below reads the
# `entity_group` and `word` fields of each result.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

#  Utility: Chunk text to avoid model length overflow 
def chunk_text(text, max_length=400):
    """Split ``text`` into chunks of about ``max_length`` characters on whitespace.

    Words are never cut: with ``textwrap``'s defaults a hyphenated or very
    long word could be broken across two chunks (e.g. "Orme-" / "Zavaleta"),
    so the NER model never saw the full name. A single word longer than
    ``max_length`` is therefore kept whole and may exceed the limit.

    Args:
        text: Text to split; runs of whitespace are collapsed by ``textwrap``.
        max_length: Target maximum number of characters per chunk.

    Returns:
        List of chunk strings (empty list for blank input).
    """
    return textwrap.wrap(
        text,
        max_length,
        break_long_words=False,
        break_on_hyphens=False,
    )

def extract_people(text):
    """Count "PER" entities found by the module-level ``ner_pipeline`` in ``text``.

    The text is split with ``chunk_text(text, max_length=400)`` and each chunk
    is processed separately. An exception raised while handling a chunk is
    printed and the remaining chunks are still processed.

    Args:
        text: Article text to analyse.

    Returns:
        A plain ``dict`` mapping each whitespace-stripped entity string to the
        number of times it was reported.
    """
    tally = defaultdict(int)
    for piece in chunk_text(text, max_length=400):
        try:
            for found in ner_pipeline(piece):
                if found["entity_group"] != "PER":
                    continue
                tally[found["word"].strip()] += 1
        except Exception as e:
            print(f"NER error: {e}")
    return dict(tally)

#  Core function: extract from saved HTML file 
def process_html_file(filepath):
    """Read a saved HTML page and return its paragraph text as one string.

    ``<p class="paywall">`` elements are used when present; otherwise every
    ``<p>`` element is used.

    Text fragments inside a paragraph are joined with a single space.
    Previously ``get_text(strip=True)`` was called without a separator, which
    glues the text of inline elements to the surrounding words (for example
    "says" + "Jane Doe" became "saysJane Doe") and corrupts names before NER.

    Args:
        filepath: Path of a UTF-8 encoded HTML file.

    Returns:
        The paragraph texts joined by single spaces ("" if there are none).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    paragraphs = soup.find_all("p", class_="paywall")
    if not paragraphs:  # fallback if paywall not found
        paragraphs = soup.find_all("p")

    text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)
    return text

#  Main pipeline 
if __name__ == "__main__":
    HTML_DIR = "/content/drive/MyDrive/OIA_weird/htmls/wired_rendered_html/"  # mounting drive folder here
    OUTPUT_JSON = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_ner.json"

    results = []

    # Filter before wrapping in tqdm so the progress bar counts HTML files only.
    html_files = sorted(name for name in os.listdir(HTML_DIR) if name.endswith(".html"))

    for filename in tqdm(html_files, desc="Processing HTMLs", ncols=100):
        try:
            html_path = os.path.join(HTML_DIR, filename)
            text = process_html_file(html_path)

            mention_counts = extract_people(text)

            results.append({
                # Strip only the file extension, then prettify the remaining name.
                "title": os.path.splitext(filename)[0].replace("_", " ").title(),
                "author": "Unknown",  # You can fix this if stored separately
                "url": "Unknown",     # You can map this back from index if needed
                # Files that fail are skipped, so list position alone cannot be
                # mapped back to an input; keep the source filename on each record.
                "html_file": filename,
                "mentioned_people": list(mention_counts.keys()),
                "mention_counts": mention_counts,
                "source": "Wired"
            })
        except Exception as e:
            print(f"Failed on {filename}: {e}")
            continue

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n Done! Processed {len(results)} articles → {OUTPUT_JSON}")


In [3]:
# Merge the title, author, and URL from the original article list into the NER results

In [None]:
import json

# Paths to input files
NER_RESULTS_FILE = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_ner.json"
ORIGINAL_ARTICLES_FILE = "/content/drive/MyDrive/OIA_weird/wired_science_articles_1379.json"
OUTPUT_FILE = "/content/drive/MyDrive/OIA_weird/wired_people_mentions_final.json"

# Load both files
with open(NER_RESULTS_FILE, "r", encoding="utf-8") as f:
    ner_data = json.load(f)

with open(ORIGINAL_ARTICLES_FILE, "r", encoding="utf-8") as f:
    original_articles = json.load(f)

# Sanity check. An explicit exception is used instead of `assert`, which is
# skipped when Python runs with -O, and the message reports both counts.
if len(ner_data) != len(original_articles):
    raise ValueError(
        "Mismatch in number of entries! "
        f"NER results: {len(ner_data)}, original articles: {len(original_articles)}"
    )

# Merge by index.
# NOTE(review): the pairing is purely positional. The NER results file is
# written by the HTML-processing cell in sorted-filename order and skips files
# that fail, so equal lengths alone do not prove that record i belongs to
# article i — confirm that the HTML filenames sort in the same order as the
# article list.
merged_data = []
for ner_record, article in zip(ner_data, original_articles):
    ner_record["title"] = article["title"]
    ner_record["author"] = article["author"]
    ner_record["url"] = article["url"]
    merged_data.append(ner_record)

# Save output
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2, ensure_ascii=False)

print(f" Done! Merged metadata into all {len(merged_data)} records → {OUTPUT_FILE}")
