# In [None]:  (notebook cell marker left over from the export)
import os, re, json, random, time, asyncio, sys
import pandas as pd
import tldextract
import trafilatura
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode, urljoin
from tqdm import tqdm

# --- Input / output -------------------------------------------------------
INPUT_CSV   = "mentions_original_part1.csv"   # must contain a "Mention URL" column
OUTPUT_CSV  = "mentions_original_part1_fill.csv"   # input rows plus the scraped columns

# --- Fetch behaviour ------------------------------------------------------
MAX_CONCURRENCY              = 50    # global semaphore size and TCP connector limit
MAX_CONCURRENCY_PER_HOST     = 6     # per-host semaphore size (SUSPECT_HOSTS get 2)
REQUEST_TIMEOUT              = 25    # seconds, used as the socket read timeout
CONNECT_TIMEOUT              = 8     # seconds, used for connect / sock_connect
RETRY_MAX                    = 3     # total attempts per URL in fetch_one
BACKOFF_BASE                 = 1.5   # base of the exponential retry delay
SLEEP_BETWEEN_BATCHES        = (0.0, 0.0)  # not referenced by the code visible in this file

USER_AGENT = "USyd-Altmetric-4.4/async/1.0"
# Hosts that are fetched with a lower per-host concurrency and that get a
# same-site Referer header on the last retry attempt.
SUSPECT_HOSTS = {
    "forbetterscience.com",
    "retractionwatch.com",
    "whyevolutionistrue.com",
    "rawnews.com",
    "ct.moreover.com",
}
# Source hosts for which a redirect to a different domain is accepted as the
# final URL even when FORCE_SAME_DOMAIN_FINAL_URL is enabled.
ALLOW_CROSS_DOMAIN_HOSTS = {
    "ct.moreover.com", "feedproxy.google.com", "feedburner.com",
}

# Extra request headers sent together with USER_AGENT on every request.
EXTRA_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
}

# When True, a final URL on a different domain than the requested URL is
# replaced by the requested URL (except for ALLOW_CROSS_DOMAIN_HOSTS).
FORCE_SAME_DOMAIN_FINAL_URL = True

# --- Text extraction ------------------------------------------------------
MIN_TEXT_LEN_FOR_OK   = 20      # shorter extractions count as failed / "bad"
HARD_MAX_TEXT_CHARS   = 50000   # extracted text is cut to this many characters

REFETCH_BAD = True   # run the trafilatura.fetch_url fallback pass over bad URLs
# HTTP statuses that trigger a retry in fetch_one and queue the URL for the
# fallback pass in the parsing loop.
RETRY_STATUS_SET = {403, 429, 410, 500, 502, 503, 504}

# --- Image filtering heuristics -------------------------------------------
MIN_DIMENSION_PX   = 120   # width/height below this marks an image as small
# Both negative patterns are unanchored substring searches.
# NOTE(review): `ads?` therefore also matches names such as "download" or
# "header", and `icon` matches "silicon" -- confirm this is intended.
FILENAME_NEG_PAT   = re.compile(r"(logo|icon|sprite|avatar|ads?|banner|pixel|track|spacer|blank)", re.I)
CLASS_NEG_PAT      = re.compile(r"(logo|icon|avatar|ads?|banner|breadcrumb|nav|footer|header)", re.I)
# CSS classes that override a CLASS_NEG_PAT hit and also count as relevance evidence.
WP_POS_PAT         = re.compile(r"(wp-post-image|entry-content|size-\w+)", re.I)
AROUND_TEXT_CHARS  = 40    # minimum amount of surrounding text for "near_text"
# When True, og:image / twitter:image URLs are kept only if they are on the
# same domain as the page.
KEEP_OGIMAGE_IF_SAME_DOMAIN = True

# Inside a Jupyter kernel an event loop is already running; nest_asyncio is
# applied when available, and its absence (or any failure) is ignored.
if 'ipykernel' in sys.modules:
    try:
        import nest_asyncio
        nest_asyncio.apply()
    except Exception:
        pass

def normalize_url(u: str) -> str:
    """Return a canonical form of an http(s) URL, or "" when it is unusable.

    The scheme and host are lower-cased, ``utm_*``/``fbclid``/``gclid`` query
    parameters are removed, one trailing slash is dropped from the path and
    the fragment is discarded. Non-string input, blank input, non-http(s)
    schemes and unparsable URLs all yield the empty string.
    """
    if not isinstance(u, str):
        return ""
    candidate = u.strip()
    if candidate == "":
        return ""
    try:
        parts = urlparse(candidate)
        scheme = parts.scheme.lower()
        if scheme not in ("http", "https"):
            return ""
        # Keep every parameter except the tracking ones; order is preserved.
        kept = {
            key: values
            for key, values in parse_qs(parts.query, keep_blank_values=True).items()
            if not (key.lower().startswith("utm_") or key.lower() in ("fbclid", "gclid"))
        }
        query = urlencode(kept, doseq=True)
        path = parts.path
        if path.endswith("/"):
            path = path[:-1]
        return urlunparse((scheme, parts.netloc.lower(), path, parts.params, query, ""))
    except Exception:
        return ""

def host_of(u):
    """Return the lower-cased network location of ``u`` without its port.

    Any parsing failure yields the empty string.
    """
    try:
        netloc = urlparse(u).netloc
        hostname, _, _ = netloc.partition(":")
        return hostname.lower()
    except Exception:
        return ""

def same_domain(u1, u2):
    """Tell whether two URLs point at the same host or a subdomain of it.

    Hosts are compared case-insensitively and without their port. One host
    counts as belonging to the other only when it is identical or ends with
    ``"." + other``; a bare suffix match is not enough, so
    ``notexample.com`` is not treated as ``example.com``.

    Returns False when either URL has no host or cannot be parsed.
    """
    try:
        a = urlparse(u1).netloc.split(":")[0].lower()
        b = urlparse(u2).netloc.split(":")[0].lower()
    except Exception:
        return False
    # An empty host would otherwise match everything: x.endswith("") is True.
    if not a or not b:
        return False
    return a == b or a.endswith("." + b) or b.endswith("." + a)

def pick_article_container(soup: BeautifulSoup):
    """Pick the element most likely to hold the article body.

    Preference order: the first ``<article>``, then the first ``<main>``,
    then the first ``<div>`` (or failing that ``<section>``) whose class
    matches one of a list of keywords, tried keyword by keyword. Falls back
    to ``<body>`` and finally to the document itself.
    """
    for tag_name in ("article", "main"):
        found = soup.find(tag_name)
        if found:
            return found
    keywords = ("article", "post", "story", "content", "entry",
                "article-body", "post-content", "rich-text")
    for keyword in keywords:
        class_pat = re.compile(keyword, re.I)
        # A <div> match wins over a <section> match for the same keyword.
        for tag_name in ("div", "section"):
            found = soup.find(tag_name, class_=class_pat)
            if found:
                return found
    return soup.body or soup

def nearby_text_len(el):
    """Return how much text surrounds ``el``.

    Adds up the stripped text length of the parent (when there is one) and
    of up to two previous and two next siblings. Siblings without a
    ``get_text`` method are skipped, and errors while reading a sibling are
    ignored.
    """
    lengths = []
    parent = el.parent
    if parent:
        lengths.append(len(parent.get_text(separator=" ", strip=True) or ""))
    neighbours = list(el.previous_siblings)[:2] + list(el.next_siblings)[:2]
    for node in neighbours:
        try:
            if hasattr(node, "get_text"):
                lengths.append(len(node.get_text(separator=" ", strip=True) or ""))
        except Exception:
            pass
    return sum(lengths)

def parse_max_srcset_width(srcset_val: str):
    """Return the largest ``<n>w`` width descriptor in a srcset string.

    Returns None when no width descriptor is present or when the value
    cannot be processed (for example when it is not a string).
    """
    try:
        hits = (re.search(r"\s(\d+)w", entry.strip()) for entry in srcset_val.split(","))
        found = [int(hit.group(1)) for hit in hits if hit]
    except Exception:
        return None
    return max(found) if found else None

_PX_DIM_PAT = re.compile(r"^\s*(\d+)(?:\.\d+)?\s*(?:px)?\s*$", re.I)


def _parse_px_dimension(value):
    """Return an HTML width/height attribute as whole pixels, or None.

    Only plain numbers with an optional ``px`` unit are accepted. Values such
    as ``"100%"`` or ``"auto"`` are not pixel sizes and yield None, and a
    decimal such as ``"12.5"`` is truncated to 12 rather than read as 125.
    """
    if value is None:
        return None
    m = _PX_DIM_PAT.match(str(value))
    return int(m.group(1)) if m else None


def extract_relevant_images(html: str, base_url: str):
    """Return absolute URLs of images that look like article content.

    Candidates come from two places, in this order:

    1. ``og:image`` / ``twitter:image`` meta tags. These are dropped when the
       file name matches FILENAME_NEG_PAT or, if KEEP_OGIMAGE_IF_SAME_DOMAIN
       is set, when they are hosted on a different domain than ``base_url``.
    2. ``<img>`` elements inside the container chosen by
       pick_article_container. These are filtered by file name, CSS class
       and declared size, and must show at least one sign of relevance:
       a figcaption, enough nearby text, a long alt text, a wrapping link,
       or a class matching WP_POS_PAT.

    Args:
        html: Page markup.
        base_url: URL used to resolve relative image references.

    Returns:
        De-duplicated list of image URLs in discovery order; an empty list
        when the markup cannot be parsed.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        return []
    article = pick_article_container(soup)
    out, seen = [], set()

    def _keep(url):
        # Preserve first-seen order while dropping duplicates.
        if url not in seen:
            seen.add(url)
            out.append(url)

    for sel in ("meta[property='og:image']", "meta[name='twitter:image']"):
        for m in soup.select(sel):
            u = (m.get("content") or "").strip()
            if not u:
                continue
            u = urljoin(base_url, u)
            if FILENAME_NEG_PAT.search(u.split("/")[-1].lower()):
                continue
            # Reuse the module-level helper instead of a private copy of it.
            if KEEP_OGIMAGE_IF_SAME_DOMAIN and not same_domain(base_url, u):
                continue
            _keep(u)

    for img in article.find_all("img"):
        src = (img.get("src") or img.get("data-src") or img.get("data-original") or "").strip()
        if not src:
            continue
        u = urljoin(base_url, src)
        if not u.startswith("http"):
            continue
        if FILENAME_NEG_PAT.search(u.split("/")[-1].lower()):
            continue
        cls = " ".join(img.get("class", [])).lower()
        wp_hit = bool(WP_POS_PAT.search(cls))
        # A positive class match overrides a negative one.
        if CLASS_NEG_PAT.search(cls) and not wp_hit:
            continue

        w = _parse_px_dimension(img.get("width"))
        h = _parse_px_dimension(img.get("height"))
        if not w and img.get("srcset"):
            w = parse_max_srcset_width(img.get("srcset"))

        alt = (img.get("alt") or "").strip()
        anchor_wrapped = img.find_parent("a") is not None
        too_small = (w and w < MIN_DIMENSION_PX) or (h and h < MIN_DIMENSION_PX)
        # Small images survive only with a descriptive alt text or a link.
        if too_small and len(alt) < 12 and not anchor_wrapped:
            continue

        figure = img.find_parent("figure")
        has_caption = bool(figure and figure.find("figcaption"))
        near_text = nearby_text_len(img) >= AROUND_TEXT_CHARS
        alt_long = len(alt) >= 12
        if not (has_caption or near_text or alt_long or anchor_wrapped or wp_hit):
            continue
        _keep(u)
    return out

# Matches `window.location = "..."`, `document.location.href = '...'` and similar.
JS_REDIRECT_PAT = re.compile(r"""(?:window|document)\.location(?:\.href)?\s*=\s*['"]([^'"]+)['"]""", re.I)
# Matches the content of <meta http-equiv="refresh">, e.g. `0; url=/next`.
# The URL may be wrapped in single or double quotes; group(1) excludes them.
META_REFRESH_PAT = re.compile(r"""^\s*\d+\s*;\s*url\s*=\s*["']?(.+?)["']?\s*$""", re.I)


def discover_target_url(html: str, base_url: str) -> str:
    """Find the URL a stub or redirect page points to.

    Sources are tried in this order and the first hit wins: the
    ``<link rel="canonical">`` href, the ``og:url`` meta tag, a
    ``<meta http-equiv="refresh">`` target, and a JavaScript
    ``window.location`` / ``document.location`` assignment in an inline
    script.

    Args:
        html: Page markup.
        base_url: URL used to resolve a relative target.

    Returns:
        The absolute target URL, or "" when nothing is found or the markup
        cannot be parsed. The result may equal ``base_url``; callers are
        expected to compare the two.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        return ""

    canon = soup.find("link", rel=lambda v: v and "canonical" in v.lower())
    if canon and canon.get("href"):
        return urljoin(base_url, canon.get("href").strip())

    ogu = soup.find("meta", attrs={"property": "og:url"})
    if ogu and ogu.get("content"):
        return urljoin(base_url, ogu.get("content").strip())

    mref = soup.find("meta", attrs={"http-equiv": lambda v: v and v.lower() == "refresh"})
    if mref and mref.get("content"):
        m = META_REFRESH_PAT.match(mref.get("content").strip())
        if m:
            return urljoin(base_url, m.group(1).strip())

    for script in soup.find_all("script"):
        # .string is None for scripts without a single text child.
        m = JS_REDIRECT_PAT.search(script.string or "")
        if m:
            return urljoin(base_url, m.group(1).strip())
    return ""

import aiohttp
from aiohttp import ClientTimeout

# Caps the number of requests in flight across all hosts.
global_semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
# host -> semaphore, created on first use by get_host_sema.
host_semaphores = {}


def get_host_sema(url):
    """Return the semaphore that limits concurrent requests to ``url``'s host.

    The semaphore is created on first use. Hosts in SUSPECT_HOSTS are limited
    to 2 concurrent requests, all others to MAX_CONCURRENCY_PER_HOST.
    """
    host = host_of(url)
    sema = host_semaphores.get(host)
    if sema is None:
        limit = 2 if host in SUSPECT_HOSTS else MAX_CONCURRENCY_PER_HOST
        sema = asyncio.Semaphore(limit)
        host_semaphores[host] = sema
    return sema

def build_timeout():
    """Build the aiohttp timeout used for every request.

    There is no overall deadline; connecting is bounded by CONNECT_TIMEOUT
    and each socket read by REQUEST_TIMEOUT.
    """
    limits = {
        "total": None,
        "connect": CONNECT_TIMEOUT,
        "sock_connect": CONNECT_TIMEOUT,
        "sock_read": REQUEST_TIMEOUT,
    }
    return ClientTimeout(**limits)

async def fetch_one(session, url):
    """Fetch ``url`` with retries and return its HTML.

    Up to RETRY_MAX attempts are made. An attempt is retried when it raises
    or when the response status is in RETRY_STATUS_SET. On the last attempt
    a same-site ``Referer`` header is added for hosts in SUSPECT_HOSTS.

    The backoff delay is awaited only after the concurrency semaphores and
    the response have been released, so a waiting retry does not block
    other requests, and no delay follows the final attempt.

    Args:
        session: An open aiohttp client session.
        url: The URL to request.

    Returns:
        ``(status, html, final_url, error)``. ``html`` is the decoded body
        for a 200 response with an HTML content type and None otherwise.
        ``error`` is "" on success, ``"http_<status>"`` for an unusable
        response, or the message of the last exception (its class name when
        the message is empty, as with timeouts).
    """
    err = ""
    final_url = url
    status = None
    host = host_of(url)
    for attempt in range(1, RETRY_MAX + 1):
        last_attempt = attempt == RETRY_MAX
        try:
            async with global_semaphore, get_host_sema(url):
                hdrs = {"User-Agent": USER_AGENT, **EXTRA_HEADERS}
                if last_attempt and host in SUSPECT_HOSTS:
                    hdrs["Referer"] = f"https://{host}/"
                async with session.get(url, allow_redirects=True, timeout=build_timeout(), headers=hdrs) as resp:
                    status = resp.status
                    final_url = str(resp.url)
                    # Header values are case-insensitive for media types.
                    ctype = resp.headers.get("Content-Type", "").lower()
                    if status == 200 and "text/html" in ctype:
                        html = await resp.text(errors="ignore")
                        return status, html, final_url, ""
                    if status not in RETRY_STATUS_SET or last_attempt:
                        return status, None, final_url, f"http_{status}"
            # Retryable status: semaphores and connection are released here.
            delay = (BACKOFF_BASE ** attempt) + random.random()
        except Exception as e:
            # str() of a timeout error is empty; fall back to the class name.
            err = str(e) or type(e).__name__
            delay = (BACKOFF_BASE ** attempt) * 0.5 + random.random()
        if not last_attempt:
            await asyncio.sleep(delay)
    return status, None, final_url, (err or f"http_{status}")

async def fetch_all(urls):
    """Fetch every URL concurrently and return the results keyed by URL.

    Each value is the ``(status, html, final_url, error)`` tuple produced by
    fetch_one. A progress bar advances as individual fetches finish.
    """
    collected = {}
    # NOTE: ssl=False turns off TLS certificate verification for all requests.
    pool = aiohttp.TCPConnector(limit=MAX_CONCURRENCY, ssl=False)
    # DummyCookieJar: cookies are neither stored nor sent between requests.
    async with aiohttp.ClientSession(connector=pool, cookie_jar=aiohttp.DummyCookieJar()) as session:

        async def _fetch_tagged(target):
            # Tag the outcome with its URL, since completion order is arbitrary.
            outcome = await fetch_one(session, target)
            return (target,) + tuple(outcome)

        pending = [asyncio.create_task(_fetch_tagged(u)) for u in urls]
        progress = tqdm(asyncio.as_completed(pending), total=len(pending), desc="Fetching")
        for next_done in progress:
            key, *payload = await next_done
            collected[key] = tuple(payload)
    return collected

from readability import Document

def extract_text_strong(html: str, base_url: str):
    """Extract the main text of a page, with a readability fallback.

    trafilatura is tried first. When it yields fewer than
    MIN_TEXT_LEN_FOR_OK characters, the readability summary is used instead
    if it is longer; errors in that fallback are ignored. The result is cut
    to HARD_MAX_TEXT_CHARS characters. ``base_url`` is accepted but not used.

    Returns:
        ``(text, text_len, method, truncated, full_len)`` where ``method`` is
        "trafilatura" or "readability", ``truncated`` is 1 when the text was
        cut and 0 otherwise, and ``full_len`` is the length before cutting.
    """
    body = trafilatura.extract(html, include_comments=False, include_tables=False, no_fallback=True) or ""
    method = "trafilatura"
    if len(body) < MIN_TEXT_LEN_FOR_OK:
        try:
            summary_html = Document(html).summary(html_partial=True)
            if summary_html:
                alternative = BeautifulSoup(summary_html, "lxml").get_text(separator="\n", strip=True)
                if len(alternative) > len(body):
                    body, method = alternative, "readability"
        except Exception:
            pass
    full_len = len(body)
    truncated = 1 if full_len > HARD_MAX_TEXT_CHARS else 0
    text = body[:HARD_MAX_TEXT_CHARS] if truncated else body
    return text, len(text), method, truncated, full_len

df = pd.read_csv(INPUT_CSV, dtype=str, keep_default_na=False, na_values=[])
if "Mention URL" not in df.columns:
    raise SystemExit("The input is missing the 'Mention URL' column.")

df["Mention URL"] = df["Mention URL"].astype(str).str.strip()
df["norm_url"] = df["Mention URL"].map(normalize_url)
valid = df[df["norm_url"] != ""].copy()

unique_urls = sorted(valid["norm_url"].unique())
print(f"Unique URLs to fetch: {len(unique_urls)}")

html_map = await fetch_all(unique_urls)

cache = {}      # norm_url -> dict holding the values of the output columns
bad_urls = []   # URLs queued for the trafilatura fallback pass

for u in tqdm(unique_urls, desc="Parsing"):
    status, html, final_url, err = html_map.get(u, (None, None, u, "no_result"))
    # A redirect that left the original domain is reported under the
    # requested URL, unless the source host is a known redirector.
    if status == 200 and html and FORCE_SAME_DOMAIN_FINAL_URL:
        if host_of(u) not in ALLOW_CROSS_DOMAIN_HOSTS and not same_domain(u, final_url):
            final_url = u
    if status != 200 or not html:
        # Fetch failed: store an empty record that carries the error.
        cache[u] = dict(
            final_url=final_url, http_status=status, error=err,
            domain="", page_title="", article_title="",
            text="", text_len=0, text_len_full=0,
            extraction_method="none", text_truncated=0,
            images_json="[]", images_count=0
        )
        if status in RETRY_STATUS_SET or (status == 200 and not html):
            bad_urls.append(u)
        continue
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = None
    page_title, article_title = "", ""
    if soup:
        og_tag = soup.select_one("meta[property='og:title']")
        og_title = (og_tag.get("content") or "").strip() if og_tag else ""
        # Page title: <title>, else og:title.
        if soup.title and soup.title.string:
            page_title = soup.title.string.strip()
        else:
            page_title = og_title
        # Article title: first non-empty <h1>, else og:title.
        h1 = soup.find("h1")
        h1_text = h1.get_text(strip=True) if h1 else ""
        article_title = h1_text or og_title
    text, text_len, extract_method, truncated, text_len_full = extract_text_strong(html, final_url)
    images = extract_relevant_images(html, final_url)
    if text_len < MIN_TEXT_LEN_FOR_OK:
        # Too little text: the page may be a stub pointing at the real article.
        target = discover_target_url(html, final_url)
        # normalize_url returns "" for non-http(s) targets (mailto:, javascript:, ...),
        # which must not be fetched.
        norm_target = normalize_url(target) if target else ""
        if norm_target and norm_target != normalize_url(final_url):
            try:
                html_t = trafilatura.fetch_url(target)
            except Exception:
                # Best effort, as in the fallback pass: one failing download
                # must not abort the whole parsing loop.
                html_t = None
            if html_t:
                t2, tl2, m2, tr2, fl2 = extract_text_strong(html_t, target)
                if tl2 > text_len:
                    text, text_len, extract_method, truncated, text_len_full = t2, tl2, m2, tr2, fl2
                    images2 = extract_relevant_images(html_t, target)
                    if len(images2) >= len(images):
                        images = images2
                    final_url = target
                    try:
                        soup2 = BeautifulSoup(html_t, "lxml")
                        if soup2.title and soup2.title.string:
                            page_title = soup2.title.string.strip() or page_title
                        h1 = soup2.find("h1")
                        if h1 and h1.get_text(strip=True):
                            article_title = h1.get_text(strip=True)
                    except Exception:
                        pass
    ext = tldextract.extract(final_url)
    domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
    cache[u] = dict(
        final_url=final_url, http_status=status, error="",
        domain=domain, page_title=page_title, article_title=article_title,
        text=text, text_len=text_len, text_len_full=text_len_full,
        extraction_method=extract_method, text_truncated=truncated,
        images_json=json.dumps(images, ensure_ascii=False), images_count=len(images)
    )
    if text_len < MIN_TEXT_LEN_FOR_OK:
        bad_urls.append(u)

# Second pass: retry every bad URL with trafilatura's own downloader.
if REFETCH_BAD and bad_urls:
    bad_urls = sorted(set(bad_urls))
    print(f"Refetching fallback for {len(bad_urls)} URLs via trafilatura.fetch_url ...")
    for u in tqdm(bad_urls, desc="Fallback"):
        res = cache.get(u, {})
        prev_len = res.get("text_len", 0)
        if res and prev_len >= MIN_TEXT_LEN_FOR_OK:
            continue
        try:
            html2 = trafilatura.fetch_url(u)
            if not html2:
                continue
            t2, tl2, m2, tr2, fl2 = extract_text_strong(html2, u)
            # Never replace the cached result with one that has less text.
            if tl2 < prev_len:
                continue
            images2 = extract_relevant_images(html2, u)
            page_title, article_title = res.get("page_title", ""), res.get("article_title", "")
            try:
                soup2 = BeautifulSoup(html2, "lxml")
                if soup2.title and soup2.title.string:
                    page_title = soup2.title.string.strip() or page_title
                h1 = soup2.find("h1")
                if h1 and h1.get_text(strip=True):
                    article_title = h1.get_text(strip=True)
            except Exception:
                pass
            ext = tldextract.extract(u)
            domain2 = ".".join([p for p in [ext.domain, ext.suffix] if p])
            # The error of the first fetch is kept alongside the new status 200.
            cache[u] = dict(
                final_url=u if (FORCE_SAME_DOMAIN_FINAL_URL and host_of(u) not in ALLOW_CROSS_DOMAIN_HOSTS) else (res.get("final_url", u) or u),
                http_status=200, error=res.get("error", ""),
                domain=domain2, page_title=page_title, article_title=article_title,
                text=t2, text_len=tl2, text_len_full=fl2,
                extraction_method=m2, text_truncated=tr2,
                images_json=json.dumps(images2, ensure_ascii=False), images_count=len(images2)
            )
        except Exception as exc:
            # Still best effort, but report the failure instead of hiding it.
            tqdm.write(f"Fallback failed for {u}: {exc!r}")

# Columns copied from the per-URL cache into the output table.
cols_add = ["final_url","http_status","error","domain","page_title","article_title","text","text_len","text_len_full","extraction_method","text_truncated","images_json","images_count"]
for col in cols_add:
    if col not in valid.columns:
        valid[col] = ""

# Copy each URL's cached result into every row that shares its norm_url.
for idx, norm in tqdm(list(valid["norm_url"].items()), total=len(valid), desc="Filling"):
    res = cache.get(norm)
    if not res:
        continue
    for col in cols_add:
        valid.at[idx, col] = res[col]

# Drop the "top_image" column if the input carried one.
stale = [name for name in ["top_image"] if name in valid.columns]
if stale:
    valid.drop(columns=stale, inplace=True, errors="ignore")

# Rows without a usable URL are appended unchanged after the processed rows.
skipped = df[df["norm_url"] == ""]
df_out = pd.concat([valid, skipped], ignore_index=True)
df_out.to_csv(OUTPUT_CSV, index=False)

unique_ok = sum(
    1
    for entry in (cache.get(key, {}) for key in unique_urls)
    if entry.get("http_status") == 200 and entry.get("text_len", 0) >= MIN_TEXT_LEN_FOR_OK
)
print("\n=== Done ===")
print(f"Unique URLs: {len(unique_urls)}")
print(f"OK (len(text) >= {MIN_TEXT_LEN_FOR_OK}): {unique_ok}/{len(unique_urls)}")
print(f"Output: {OUTPUT_CSV}")