## Extract Text Data from News Articles

### Setup and Data Load

In [12]:
# Import standard-library and third-party dependencies
import random
import time
import os
import pandas as pd
import glob
import json
from typing import Dict
import openpyxl
import requests
from tqdm import tqdm

In [3]:
# Import text extraction libraries
from newspaper import Article as newspaper3kArticle
import newspaper as newspaper4k

In [25]:
# Set input and output directories
# All project data lives under one root folder; derive each location from it
# so the long prefix is written only once.
data_root = "/Users/tylund/Library/CloudStorage/Dropbox/1. Side Projects/2025.1-Deepfake Threat Landscape/1-data"
deepfake_path = f"{data_root}/deepfake-incidents"
repo_path = f"{data_root}/incident-repos"
webdata_path = f"{data_root}/web-data"

# The test cells write their CSVs to `out_path`, which no visible cell defines.
# NOTE(review): assumed to be the web-data folder -- confirm the intended location.
out_path = webdata_path

In [5]:
# Load the GPT results CSV and the 'Incidents' sheet of the AIAAIC workbook
aaic_deepfakes = pd.read_csv(os.path.join(deepfake_path, 'aaic_gpt_results.csv'))

repo_workbook = os.path.join(repo_path, 'AIAAIC-repository-09042025.xlsx')
# skiprows=1 drops the first spreadsheet row before the header is read
aaic_repo = pd.read_excel(repo_workbook, sheet_name='Incidents', skiprows=1)

# Keep only the ID and link columns, renamed to the names used for merging
repo_column_names = {'AIAAIC ID#': 'incident_id', 'Summary/links': 'URL'}
aaic_repo = aaic_repo[list(repo_column_names)].rename(columns=repo_column_names)

In [6]:
# Build the frame of incident IDs and URLs
cols_to_drop = ['is_deepfake', 'comment', 'gpt_classification', 'match']

# Keep the rows whose 'final' flag is set, then discard the screening columns
passed_screening = aaic_deepfakes['final']
aaic_deepfakes = aaic_deepfakes[passed_screening].drop(columns=cols_to_drop)

# Attach URLs by incident ID; the left join keeps incidents with no repo match
aaic_deepfakes = aaic_deepfakes.merge(aaic_repo, how='left', on='incident_id')

### Define Library Wrappers

In [7]:
# Canonical column order for extraction result records.
# 'error' comes last: it is optional, but always present for failures.
COLUMNS = ['success', 'title', 'text', 'text_length', 'elapsed_time', 'error']

In [8]:
def check_for_blocks(text: str, url: str) -> Dict:
    """Return success=False if blocked by paywall or JS-heavy page.

    Args:
        text: Extracted article text to scan. ``None`` or an empty string is
            treated as text containing no block phrase.
        url: Accepted for call-site compatibility; not consulted.

    Returns:
        ``{"success": False, "error": "blocked_content"}`` when any known
        block phrase occurs in ``text`` (case-insensitive), otherwise
        ``{"success": True}``.
    """
    # Phrases are stored lowercase so they can be compared directly against
    # the lowercased text.
    blocked_phrases = (
        "unauthorized access", "subscription required", "javascript is not available",
        "please enable javascript", "you must be logged in", "access denied",
    )
    # Lowercase the (possibly long) text once instead of once per phrase, and
    # tolerate a missing text instead of raising AttributeError.
    haystack = (text or "").lower()
    if any(phrase in haystack for phrase in blocked_phrases):
        return {"success": False, "error": "blocked_content"}
    return {"success": True}

#### ---- Newspaper3k ----

In [29]:
def extract_newspaper3k(url: str) -> Dict:
    """Download and parse ``url`` with newspaper3k and return a result record.

    Every return path yields the same keys in the same order (``success``,
    ``text``, ``title``, ``text_length``, ``elapsed_time``, ``error``), so a
    DataFrame built from these records has a stable column layout and no
    missing cells for failed rows.

    Args:
        url: Address of the article to fetch.

    Returns:
        On success: ``success=True``, the extracted ``text`` and ``title``,
        the text length, the elapsed seconds (rounded to 3 decimals) and an
        empty ``error``. On failure (blocked content or any exception):
        ``success=False``, empty ``text``, ``title=None``, ``text_length=0``,
        the elapsed seconds and the error message.
    """
    start = time.time()

    def _failure(error: str) -> Dict:
        # Failure records carry the full schema, including how long the
        # attempt took before it failed.
        return {
            "success": False,
            "text": "",
            "title": None,
            "text_length": 0,
            "elapsed_time": round(time.time() - start, 3),
            "error": error,
        }

    try:
        article = newspaper3kArticle(url)
        article.download()
        article.parse()
        text = article.text or ""
        title = article.title or None

        # Check for JS/paywall blocks
        block_check = check_for_blocks(text, url)
        if not block_check["success"]:
            return _failure(block_check["error"])

        return {
            "success": True,
            "text": text,
            "title": title,
            "text_length": len(text),
            "elapsed_time": round(time.time() - start, 3),
            "error": "",
        }

    except Exception as e:
        # Deliberately broad: one bad URL must not abort a scraping run, so
        # the failure is recorded in the result instead of being raised.
        return _failure(str(e))

#### ---- Newspaper4k ----

In [9]:
def extract_newspaper4k(url: str) -> Dict:
    """Download and parse ``url`` with newspaper4k and return a result record.

    Every return path yields the same keys in the same order (``success``,
    ``text``, ``title``, ``text_length``, ``elapsed_time``, ``error``), so a
    DataFrame built from these records has a stable column layout and no
    missing cells for failed rows.

    Args:
        url: Address of the article to fetch.

    Returns:
        On success: ``success=True``, the extracted ``text`` and ``title``,
        the text length, the elapsed seconds (rounded to 3 decimals) and an
        empty ``error``. On failure: ``success=False``, empty ``text``,
        ``title=None``, ``text_length=0``, the elapsed seconds and the error
        message. Errors raised while reading the parsed text or title are
        prefixed with ``parse_error:``.
    """
    start = time.time()

    def _failure(error: str) -> Dict:
        # Failure records carry the full schema, including how long the
        # attempt took before it failed.
        return {
            "success": False,
            "text": "",
            "title": None,
            "text_length": 0,
            "elapsed_time": round(time.time() - start, 3),
            "error": error,
        }

    try:
        article = newspaper4k.Article(url)
        article.download()
        article.parse()

        # Reading the parsed fields is guarded separately so such errors can
        # be told apart from download/parse failures in the results.
        try:
            text = article.text or ""
            title = article.title or None
        except Exception as inner_e:
            return _failure(f"parse_error: {inner_e}")

        # Check for JS/paywall blocks
        block_check = check_for_blocks(text, url)
        if not block_check["success"]:
            return _failure(block_check["error"])

        return {
            "success": True,
            "text": text,
            "title": title,
            "text_length": len(text),
            "elapsed_time": round(time.time() - start, 3),
            "error": "",
        }

    except Exception as e:
        # Deliberately broad: one bad URL must not abort a scraping run, so
        # the failure is recorded in the result instead of being raised.
        return _failure(str(e))

#### Initial Tests

In [31]:
# Sample dataframe
# Cap the sample size at the number of available rows: sample(3) raises
# ValueError when the frame holds fewer than three incidents.
sample_df = aaic_deepfakes.sample(n=min(3, len(aaic_deepfakes)))

In [35]:
# Newspaper3k: run the extractor over the sample and tag each record with
# the incident it belongs to
results = [
    {
        **extract_newspaper3k(row['URL']),
        'incident_id': row['incident_id'],
        'URL': row['URL'],
    }
    for _, row in sample_df.iterrows()
]

# One row per sampled incident
test_newspaper3k = pd.DataFrame(results)

test_newspaper3k.to_csv(os.path.join(out_path, 'test_newspaper3k.csv'), index=False)

In [38]:
# Newspaper4k: run the extractor over the same sample and tag each record
# with the incident it belongs to
results = [
    {
        **extract_newspaper4k(row['URL']),
        'incident_id': row['incident_id'],
        'URL': row['URL'],
    }
    for _, row in sample_df.iterrows()
]

# One row per sampled incident
test_newspaper4k = pd.DataFrame(results)

test_newspaper4k.to_csv(os.path.join(out_path, 'test_newspaper4k.csv'), index=False)

## Extract Articles

In [10]:
# --- setup requests session ---
# Identify the scraper to remote servers through a descriptive User-Agent.
# NOTE(review): the extraction cells visible here never reference `session`,
# so this header may not reach the article downloads -- confirm.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; DeepfakeScraper/1.0; +https://github.com/treinmund/deepfake-threatlandscape)"

In [14]:
# Newspaper4k: scrape every screened incident, showing a progress bar
results = []

progress = tqdm(
    aaic_deepfakes.iterrows(),
    total=len(aaic_deepfakes),
    desc="Scraping articles",
)
for _, row in progress:
    record = extract_newspaper4k(row['URL'])
    # Tag the record so it can be joined back to its incident
    record.update(incident_id=row['incident_id'], URL=row['URL'])

    time.sleep(0.5)  # polite delay

    results.append(record)

# Collect all records into one frame, one row per incident
aaic_webpages = pd.DataFrame(results)

Scraping articles: 100%|██████████████████████| 198/198 [12:04<00:00,  3.66s/it]


In [18]:
# Check for web scraping errors
# Successful records store error="" and failed ones an error message, so a
# null test on 'error' matches nothing; select failed rows via 'success'.
aaic_webpages[~aaic_webpages['success']]

Unnamed: 0,success,text,title,text_length,elapsed_time,error,incident_id,URL


In [27]:
# Write the scraped records to the web-data folder as UTF-8, without the index
aaic_webpages.to_csv(
    path_or_buf=os.path.join(webdata_path, 'aaic_webpages.csv'),
    encoding='utf-8',
    index=False,
)