In [1]:
import pandas as pd
from urllib.parse import urlparse
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from tqdm.notebook import tqdm
import requests
import os
from typing import List, Tuple
import numpy as np

In [14]:
def is_excluded_url(url: str) -> int:
    """
    Flag URLs whose path points at a static or non-HTML resource.

    Only the last segment of the URL path is inspected, after lower-casing;
    the query string and fragment are not part of the path and are ignored.

    Returns 1 when that segment is a known static filename or ends in one of
    the blocked file extensions, otherwise 0.
    """
    static_filenames = ("robots.txt", "sitemap.xml", "ads.txt", "favicon.ico")

    blocked_extensions = frozenset({
        # Office files
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        # Text/data
        "txt", "csv", "tsv", "rtf", "xml", "json", "md",
        # Images
        "jpg", "jpeg", "png", "gif", "bmp", "svg", "webp",
        # Media
        "mp3", "mp4", "avi", "mov", "wmv", "mkv", "webm",
        # Archives
        "zip", "rar", "gz", "tar", "7z",
        # Binaries
        "exe", "bin", "iso", "apk", "dmg", "msi",
        # Code/static resources
        "css", "js",
    })

    filename = os.path.basename(urlparse(url).path.lower())
    if filename in static_filenames:
        return 1

    # Everything after the final dot is the extension; a name without any dot
    # has no extension and is never excluded here.
    _, dot, extension = filename.rpartition(".")
    if dot and extension in blocked_extensions:
        return 1
    return 0

In [15]:
def make_retry_session(
    retries=3,
    backoff_factor=0.5,
    status_forcelist=(
        # Transient server-side failures worth retrying
        500,  # Internal Server Error
        502,  # Bad Gateway
        503,  # Service Unavailable
        504,  # Gateway Timeout
    ),
) -> requests.Session:
    """
    Build a requests.Session that retries transient failures automatically.

    Parameters
    ----------
    retries : int
        Retry budget; used as the total limit and also as the separate
        limits for connect errors and read errors.
    backoff_factor : float
        Backoff factor handed to urllib3's Retry to space out attempts.
    status_forcelist : tuple
        HTTP status codes that trigger a retry. Defaults to 500, 502, 503
        and 504.

    Returns
    -------
    session : requests.Session
        A session whose "http://" and "https://" adapters share one retry
        policy. Only HEAD and GET requests are retried, and
        raise_on_status is disabled on the policy.
    """
    retry_policy = Retry(
        total=retries,
        connect=retries,
        read=retries,
        status_forcelist=status_forcelist,
        backoff_factor=backoff_factor,
        # Restrict retries to safe, idempotent methods.
        allowed_methods=frozenset(("HEAD", "GET")),
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for scheme_prefix in ("https://", "http://"):
        session.mount(scheme_prefix, retrying_adapter)
    return session

In [16]:
# Load the Common Crawl sample and keep a single row per distinct URL.
df = pd.read_csv("../data/common_crawl_sample.csv").drop_duplicates(
    subset="url", ignore_index=True
)

# excluded_ext: 1 when the URL looks like a static/non-HTML resource, else 0.
# text_html_content: empty placeholder, filled in later by the content-type check.
df = df.assign(
    excluded_ext=df["url"].map(is_excluded_url),
    text_html_content=np.nan,
)
df.head()

Unnamed: 0,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,country,pattern,languages,encoding,redirect,truncated,excluded_ext,text_html_content
0,"af,gov,moj)/content/files/crpd.pdf",20240714034654,https://moj.gov.af/Content/files/CRPD.pdf,text/html,text/html,404,GY5DQ2BNNLQPCCOAYY3FTL43EN6DI5LO,8518,13208486,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Afghanistan,*.gov.af,,,,,1,
1,"af,gov,mfa,islamabad)/introductory-meeting-of-...",20240723080327,https://islamabad.mfa.gov.af/introductory-meet...,text/html,text/html,200,4VOGNJXYP5M74XLAL7FZCLYDKLQQ77XU,19902,266134915,crawl-data/CC-MAIN-2024-30/segments/1720763518...,Afghanistan,*.gov.af,eng,UTF-8,,,0,
2,"af,gov,mfa)/en/category/breaking-news",20240721095153,https://mfa.gov.af/en/category/breaking-news/,text/html,text/html,301,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,736,13362462,crawl-data/CC-MAIN-2024-30/segments/1720763517...,Afghanistan,*.gov.af,,,https://mfa.gov.af/en/category/breaking-news,,0,
3,"af,gov,mudh)/dr/%d8%a8%d8%b1%d9%86%d8%a7%d9%85...",20240719083155,https://mudh.gov.af/dr/%D8%A8%D8%B1%D9%86%D8%A...,text/html,text/html,200,I5ZMS44OMMVMBHOCXUYUZO45BSQCR653,9659,371362517,crawl-data/CC-MAIN-2024-30/segments/1720763514...,Afghanistan,*.gov.af,"fas,eng,pus",UTF-8,,,0,
4,"af,gov,mail)/en/node/13864",20240724090703,https://mail.gov.af/en/node/13864,text/html,text/html,200,WWGQY3A4YWEINGJCFBJHGCW4ELBGXOTU,8188,322912422,crawl-data/CC-MAIN-2024-30/segments/1720763518...,Afghanistan,*.gov.af,"eng,fas",UTF-8,,,0,


In [7]:
import urllib3
# Silence urllib3's InsecureRequestWarning: the content-type check in this
# notebook sends its requests with verify=False (certificate verification off),
# which is what triggers this warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
savepath = "../data/check_cc_url_contenttype.csv"
save_every = 100  # write a checkpoint after this many newly checked URLs

# Resume from an earlier checkpoint when one exists. On a first run there is no
# checkpoint yet, so keep working on the `df` built by the loading cell above
# instead of failing with FileNotFoundError.
if os.path.exists(savepath):
    df = pd.read_csv(savepath)

checked_since_save = 0

# The context manager closes the session (and its pooled connections) when done.
with make_retry_session() as session:
    try:
        for ix, row in tqdm(df.iterrows(), total=len(df), desc="Checking content type"):
            # skip if excluded via extension
            if row["excluded_ext"] == 1:
                continue
            # skip if already checked in a previous run (value is 0 or 1, not NaN)
            if pd.notna(row["text_html_content"]):
                continue

            try:
                r = session.head(
                    row["url"],
                    allow_redirects=True,
                    timeout=(3, 5),  # (connect timeout, read timeout)
                    verify=False,  # certificate errors are deliberately ignored here
                )
                content_type = r.headers.get("Content-Type", "").lower()
                df.at[ix, "text_html_content"] = int("text/html" in content_type)
            except requests.RequestException:
                # Unreachable URLs are recorded as 0 (not HTML); because the value
                # is no longer NaN they are not retried when the loop is resumed.
                df.at[ix, "text_html_content"] = 0

            # Checkpoint on the number of URLs actually checked, not on the row
            # index, so skipped rows cannot delay or suppress a save.
            checked_since_save += 1
            if checked_since_save >= save_every:
                df.to_csv(savepath, index=False)
                checked_since_save = 0
    finally:
        # Always persist the latest state: covers the rows checked since the last
        # checkpoint as well as an interrupted run.
        df.to_csv(savepath, index=False)

Checking content type:   0%|          | 0/15663 [00:00<?, ?it/s]

In [21]:
# Number of URLs whose HEAD response advertised a text/html Content-Type.
int(df["text_html_content"].eq(1).sum())

328

In [22]:
# Number of checked URLs recorded as 0 (non-HTML Content-Type or failed request).
int(df["text_html_content"].eq(0).sum())

151