## Extract Text Data from News Articles

### Setup and Data Load

In [159]:
# Import standard libraries
import random
import time
import os
import pandas as pd
import glob
import json
from typing import Dict

In [19]:
# Import text extraction libraries
from trafilatura import fetch_url, extract
from newspaper import Article as Newspaper3kArticle
import newspaper
import fundus

In [13]:
# Set the input directory (this cell only reads data; no output directory is defined here)
input_dir = "/Users/tylund/Library/CloudStorage/Dropbox/1. Side Projects/2025.1-Deepfake Threat Landscape/1-data/web-data"

# Build the glob pattern with os.path.join so the separator is platform-independent
pattern = os.path.join(input_dir, "*.csv")

# glob.glob() returns matches in arbitrary order; sort so the concatenated
# row order (and therefore the reset index) is the same on every run.
matching_files = sorted(glob.glob(pattern))

# Fail early with a clear message instead of pd.concat's
# "No objects to concatenate" when the directory is empty or the path is wrong.
if not matching_files:
    raise FileNotFoundError(f"No CSV files found matching {pattern!r}")

# Read every CSV and stack them into one frame with a fresh 0..n-1 index
dataframes = [pd.read_csv(file) for file in matching_files]
df = pd.concat(dataframes, ignore_index=True)

df.head()

Unnamed: 0,Incident_ID,Article_Number,URL,Section_Type
0,AIAAIC2018,1,https://time.com/7290050/veo-3-google-misinfor...,News / Commentary / Analysis
1,AIAAIC2006,1,https://www.indiatoday.in/india/law-news/story...,News / Commentary / Analysis
2,AIAAIC2006,2,https://www.deccanherald.com/india/delhi/delhi...,News / Commentary / Analysis
3,AIAAIC2006,3,https://economictimes.indiatimes.com/tech/arti...,News / Commentary / Analysis
4,AIAAIC2006,4,https://www.ndtv.com/india-news/delhi-hc-passe...,News / Commentary / Analysis


### Define Library Wrappers

In [112]:
# Canonical column order for extraction results.
# Each entry needs its own trailing comma: adjacent string literals without a
# comma are implicitly concatenated into a single element.
COLUMNS = [
    'success',
    'title',
    'date',
    'text',
    'text_length',
    'elapsed_time',
    'source',
    'language',
    'error',  # optional, always present for failures
]

In [160]:
def check_for_blocks(text: str, url: str) -> Dict:
    """Report whether extracted text looks like a paywall / JavaScript-gate notice.

    The check is a case-insensitive substring search for a fixed list of
    phrases anywhere in ``text``.

    Args:
        text: Extracted page text. ``None`` or an empty string is treated as
            "nothing blocked".
        url: The page URL. Accepted for call-site symmetry; not used by the check.

    Returns:
        ``{"success": False, "error": "blocked_content"}`` if any phrase is
        found, otherwise ``{"success": True}``.
    """
    # All phrases are stored lowercase so only the text needs normalising.
    blocked_phrases = (
        "unauthorized access",
        "subscription required",
        "javascript is not available",
        "please enable javascript",
        "you must be logged in",
        "access denied",
    )

    # Lowercase once instead of once per phrase; tolerate None.
    haystack = (text or "").lower()

    # NOTE(review): this matches anywhere in the body, so a genuine article that
    # quotes one of these phrases would also be flagged as blocked.
    if any(phrase in haystack for phrase in blocked_phrases):
        return {"success": False, "error": "blocked_content"}
    return {"success": True}

#### ---- Trafilatura ----

In [141]:
def extract_trafilatura(url: str) -> Dict:
    """Fetch ``url`` and extract article text and metadata with Trafilatura.

    Args:
        url: Address of the article to download.

    Returns:
        On success, a dict with ``success=True`` plus ``text``, ``title``,
        ``date``, ``source``, ``language``, ``text_length``, ``elapsed_time``
        (seconds, rounded to 3 decimals) and an empty ``error``.
        On failure, a dict with ``success=False``, an ``error`` code or
        message, ``source``, an empty ``text`` and ``language``.

    This function does not raise: any exception is converted into a failure
    dict whose ``error`` is the exception message.
    """
    start = time.time()
    try:
        # The file imports these as ``from trafilatura import fetch_url, extract``,
        # which does not bind the module name ``trafilatura``. Call the imported
        # functions directly; otherwise a NameError is swallowed below and every
        # URL is reported as failed.
        downloaded = fetch_url(url)
        if not downloaded:
            return {"success": False, "error": "fetch_failed", "source": None, "text": "", "language": None}

        # JSON output carries the metadata fields alongside the body text.
        text_json = extract(downloaded, output_format="json", with_metadata=True)
        if not text_json:
            return {"success": False, "error": "no_text_extracted", "source": None, "text": "", "language": None}

        data = json.loads(text_json)
        text = data.get("text", "") or ""
        # Normalise missing / empty metadata values to None.
        source_name = data.get("source-hostname") or None
        date_str = data.get("date") or None
        title = data.get("title") or None
        language = data.get("language") or None

        # Check for JS/paywall blocks
        block_check = check_for_blocks(text, url)
        if not block_check["success"]:
            return {
                "success": False,
                "error": block_check["error"],
                "source": source_name,
                "text": "",
                "language": language,
            }

        return {
            "success": True,
            "text": text,
            "title": title,
            "date": date_str,
            "source": source_name,
            "language": language,
            "text_length": len(text),
            "elapsed_time": round(time.time() - start, 3),
            "error": ""
        }

    except Exception as e:
        # Best-effort wrapper: one bad URL must not abort a batch run.
        return {"success": False, "error": str(e), "source": None, "text": "", "language": None}

#### ---- Newspaper3k ----

In [152]:
def extract_newspaper3k(url: str) -> Dict:
    """Download and parse ``url`` via ``Newspaper3kArticle`` and return a result dict.

    Returns a dict with ``success=True`` and the text, title, date
    (``YYYY-MM-DD`` or ``None``), source, language, text length and elapsed
    seconds on success. Otherwise returns a dict with ``success=False`` and an
    ``error`` value. Exceptions are caught and reported through ``error``.
    """
    # NOTE(review): ``Newspaper3kArticle`` is imported from the ``newspaper``
    # module, the same module ``newspaper.Article`` comes from - confirm the
    # two wrappers really exercise different library versions.
    t0 = time.time()
    try:
        art = Newspaper3kArticle(url)
        art.download()
        art.parse()

        body = art.text
        if not body:
            body = ""

        headline = art.title
        if not headline:
            headline = None

        published = art.publish_date
        iso_date = None if not published else published.strftime("%Y-%m-%d")

        site = getattr(art, "meta_site_name", None)
        lang = getattr(art, "meta_lang", None)

        # Reject paywall / JavaScript-gate pages before reporting success.
        verdict = check_for_blocks(body, url)
        if verdict["success"]:
            return {
                "success": True,
                "text": body,
                "title": headline,
                "date": iso_date,
                "source": site,
                "language": lang,
                "text_length": len(body),
                "elapsed_time": round(time.time() - t0, 3),
                "error": ""
            }

        return {
            "success": False,
            "error": verdict["error"],
            "source": site,
            "text": "",
            "language": lang,
        }

    except Exception as exc:
        return {
            "success": False,
            "error": str(exc),
            "source": None,
            "text": "",
            "language": None,
        }

#### ---- Newspaper4k ----

In [154]:
def extract_newspaper4k(url: str) -> Dict:
    """Download and parse ``url`` via ``newspaper.Article`` and return a result dict.

    Returns a dict with ``success=True`` and the text, title, date
    (``YYYY-MM-DD`` or ``None``), source, language, text length and elapsed
    seconds on success. Otherwise returns a dict with ``success=False`` and an
    ``error`` value. Failures while reading the parsed fields are reported
    with a ``parse_error:`` prefix; every other exception is reported with its
    plain message.
    """
    t0 = time.time()
    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()

        # Reading the parsed attributes is guarded separately so that these
        # failures can be told apart from download/parse failures.
        try:
            body = art.text
            if not body:
                body = ""

            headline = art.title
            if not headline:
                headline = None

            published = getattr(art, "publish_date", None)
            iso_date = None if not published else published.strftime("%Y-%m-%d")

            site = getattr(art, "meta_site_name", None)
            lang = getattr(art, "meta_lang", None)
        except Exception as field_exc:
            return {
                "success": False,
                "error": f"parse_error: {field_exc}",
                "source": None,
                "text": "",
                "language": None,
            }

        # Reject paywall / JavaScript-gate pages before reporting success.
        verdict = check_for_blocks(body, url)
        if verdict["success"]:
            return {
                "success": True,
                "text": body,
                "title": headline,
                "date": iso_date,
                "source": site,
                "language": lang,
                "text_length": len(body),
                "elapsed_time": round(time.time() - t0, 3),
                "error": ""
            }

        return {
            "success": False,
            "error": verdict["error"],
            "source": site,
            "text": "",
            "language": lang,
        }

    except Exception as exc:
        return {
            "success": False,
            "error": str(exc),
            "source": None,
            "text": "",
            "language": None,
        }

#### Initial tests

In [174]:
# Draw three random rows to smoke-test each extractor.
# No random_state is passed, so the rows picked are not fixed between runs.
sample_df = df.sample(n=3)

In [175]:
# Trafilatura: run the wrapper over the sampled rows and tabulate the results
results = []

for incident_id, url in zip(sample_df['Incident_ID'], sample_df['URL']):
    # Tag each extraction result with the row it came from
    result = {**extract_trafilatura(url), 'Incident_ID': incident_id, 'URL': url}
    results.append(result)

# One row per URL; keys missing from failure dicts show up as NaN
test_df = pd.DataFrame(results)

test_df

Unnamed: 0,success,text,title,date,source,language,text_length,elapsed_time,error,Incident_ID,URL
0,True,PM Lee Hsien Loong warns public of deepfake cr...,PM Lee Hsien Loong warns public of deepfake cr...,2023-12-29,Yahoo News,,3093.0,1.083,,AIAAIC1270,https://sg.news.yahoo.com/lee-hsien-loong-warn...
1,True,"In December, Mahindra Racing’s Formula E team ...","RIP Racing's First AI Influencer, Who Lived fo...",2024-01-11,The Drive,,3638.0,0.749,,AIAAIC1297,https://www.thedrive.com/news/rip-racings-firs...
2,False,,,,,,,,fetch_failed,AIAAIC1425,https://factcheck.afp.com/doc.afp.com.342A6RJ


In [176]:
# Newspaper3k: run the wrapper over the sampled rows and tabulate the results
results = []

for incident_id, url in zip(sample_df['Incident_ID'], sample_df['URL']):
    # Tag each extraction result with the row it came from
    result = {**extract_newspaper3k(url), 'Incident_ID': incident_id, 'URL': url}
    results.append(result)

# One row per URL; keys missing from failure dicts show up as NaN
test_df = pd.DataFrame(results)

test_df

Unnamed: 0,success,text,title,date,source,language,text_length,elapsed_time,error,Incident_ID,URL
0,True,SINGAPORE — Prime Minister Lee Hsien Loong is ...,PM Lee Hsien Loong warns public of deepfake cr...,2023-12-29,Yahoo News,en,2729,3.846,,AIAAIC1270,https://sg.news.yahoo.com/lee-hsien-loong-warn...
1,True,"The biggest car news and reviews, minus the BS...","RIP Racing’s First AI Influencer, Who Lived fo...",2024-01-11,The Drive,en,3821,1.311,,AIAAIC1297,https://www.thedrive.com/news/rip-racings-firs...
2,True,"""What's missing? English, OK. Arabic, also OK....",Indonesians misled by AI-generated video of pr...,2023-11-14,Fact Check,en,3317,3.24,,AIAAIC1425,https://factcheck.afp.com/doc.afp.com.342A6RJ


In [177]:
# Newspaper4k: run the wrapper over the sampled rows and tabulate the results
results = []

for incident_id, url in zip(sample_df['Incident_ID'], sample_df['URL']):
    # Tag each extraction result with the row it came from
    result = {**extract_newspaper4k(url), 'Incident_ID': incident_id, 'URL': url}
    results.append(result)

# One row per URL; keys missing from failure dicts show up as NaN
test_df = pd.DataFrame(results)

test_df

Unnamed: 0,success,text,title,date,source,language,text_length,elapsed_time,error,Incident_ID,URL
0,True,SINGAPORE — Prime Minister Lee Hsien Loong is ...,PM Lee Hsien Loong warns public of deepfake cr...,2023-12-29,Yahoo News,en,2729,1.212,,AIAAIC1270,https://sg.news.yahoo.com/lee-hsien-loong-warn...
1,True,"The biggest car news and reviews, minus the BS...","RIP Racing’s First AI Influencer, Who Lived fo...",2024-01-11,The Drive,en,3821,0.354,,AIAAIC1297,https://www.thedrive.com/news/rip-racings-firs...
2,True,"""What's missing? English, OK. Arabic, also OK....",Indonesians misled by AI-generated video of pr...,2023-11-14,Fact Check,en,3317,1.072,,AIAAIC1425,https://factcheck.afp.com/doc.afp.com.342A6RJ


### Data Extraction