In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random
import os
from urllib.parse import urljoin
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def setup_driver(headless: bool = False) -> webdriver.Chrome:
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When True, Chrome is started with ``--headless=new``.

    Returns:
        A Chrome driver whose page-load timeout is set to 30 seconds.
    """
    # Flags that help scraping stability.
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # ChromeDriverManager().install() supplies the driver path handed to Service.
    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    browser.set_page_load_timeout(30)
    return browser


# Start a visible (non-headless) Chrome session; the scraping functions in this
# notebook read this module-level `driver`.
driver = setup_driver(headless=False)
# Explicit wait bound to the driver with a 15-second timeout.
# NOTE(review): `wait` is not referenced in the cells visible here — confirm it is still needed.
wait = WebDriverWait(driver, 15)

In [None]:
# Site root, and one member detail page built from it that is used to probe access.
BASE_URL = "https://example-directory.com"
TEST_DETAIL_URL = f"{BASE_URL}/members/12345"


def is_access_allowed(url: str) -> str:
    """Load ``url`` in the shared driver and classify the page's access state.

    Args:
        url: Page to open.

    Returns:
        ``"RESTRICTED"`` if the page source contains a rate-limit/restriction
        phrase, ``"LOGIN_REQUIRED"`` if it contains a members-only/login
        phrase, otherwise ``"OK"``. Matching is case-insensitive.
    """
    driver.get(url)

    body_locator = (By.TAG_NAME, "body")
    try:
        WebDriverWait(driver, 6).until(EC.presence_of_element_located(body_locator))
    except TimeoutException:
        # Best effort: fall through and classify whatever markup is available.
        pass

    page_text = driver.page_source.lower()

    restricted_markers = ("access has been restricted", "exceeding the limit")
    login_markers = ("members only", "please login")

    # Restriction phrases take precedence over login phrases.
    if any(marker in page_text for marker in restricted_markers):
        return "RESTRICTED"
    if any(marker in page_text for marker in login_markers):
        return "LOGIN_REQUIRED"
    return "OK"


# Probe the test detail page once (this loads it in the browser) and show its classification.
print("Test access:", is_access_allowed(TEST_DETAIL_URL))

In [None]:
# Input file: members_raw.csv, which must contain at least a 'url' column.
INPUT_MEMBERS_CSV = "members_raw.csv"
df_links = pd.read_csv(INPUT_MEMBERS_CSV)

# Normalize in place: cast to text, trim, drop any '#fragment', drop trailing '/'.
url_text = df_links["url"].astype(str).str.strip()
url_without_fragment = url_text.str.split("#").str[0]
df_links["url"] = url_without_fragment.str.rstrip("/")

# Keep only entries that look like absolute http(s) URLs, in file order.
# Missing values were cast to the text "nan" above, so this filter removes them.
url_candidates = df_links["url"].dropna().astype(str).str.strip()
URLS = url_candidates[url_candidates.str.startswith("http")].tolist()

print("Total URL count:", len(URLS))

In [None]:
def _parse_contact_card(card) -> dict:
    """Collect every <dt>/<dd> pair found inside one contact card into a dict."""
    contact = {}

    for dl in card.select("dl"):
        label_els = dl.find_all("dt")

        for label_el in label_els:
            # Pair each term with the description that follows it, so a <dl>
            # holding several fields yields all of them, not just the first.
            value_el = label_el.find_next_sibling("dd")
            if value_el is None and len(label_els) == 1:
                # Lone term whose <dd> is not a following sibling: fall back to
                # the first <dd> anywhere inside this <dl>.
                value_el = dl.find("dd")
            if value_el is None:
                continue

            label = label_el.get_text(" ", strip=True)
            value = value_el.get_text(" ", strip=True)

            # Placeholder text shown instead of the real value is stored as missing.
            if "members only" in value.lower():
                value = None

            if label:
                contact[label] = value

    return contact


def scrape_office_contacts(url: str):
    """Load ``url`` in the shared driver and extract its office contacts.

    Args:
        url: Detail page to open.

    Returns:
        A list of dicts, one per contact card, mapping each ``<dt>`` label text
        to its ``<dd>`` value text (``None`` when the value contains
        "members only"). An empty list is returned when the body does not
        appear within 6 seconds, when the page contains a restriction or
        members-only phrase, or when no contact data is found.

    NOTE(review): blocked/timed-out pages and pages with no contacts both yield
    ``[]``, so callers cannot tell them apart from the return value alone.
    """
    driver.get(url)

    # Wait for body to load
    try:
        WebDriverWait(driver, 6).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
    except TimeoutException:
        return []

    html = driver.page_source
    html_lower = html.lower()

    # Access restriction checks
    if "access has been restricted" in html_lower or "exceeding the limit" in html_lower:
        print("ACCESS RESTRICTED for:", url)
        return []

    if "members only" in html_lower or "please login" in html_lower:
        print("MEMBERS ONLY page:", url)
        return []

    soup = BeautifulSoup(html, "lxml")

    # Primary container class first; fall back to the alternate container class.
    contact_cards = soup.select("div.contact-card") or soup.select("div.office-contact")

    contacts = []
    for card in contact_cards:
        contact = _parse_contact_card(card)
        if contact:
            contacts.append(contact)

    return contacts

In [None]:
# Output files: one wide row per URL, one long row per contact field, and the per-URL scrape log.
OUT_WIDE_CSV = "office_contacts_wide.csv"
OUT_LONG_CSV = "office_contacts_long.csv"
LOG_CSV = "scrape_log.csv"

# At most this many contacts per URL are kept in the wide row.
MAX_CONTACT = 8
# Contact labels that are looked up in each scraped contact and written to the wide row.
GLOBAL_LABELS = ["Name", "Title", "Direct Line", "Email", "Mobile", "Fax"]


def normalize_label(label: str) -> str:
    """Return ``label`` with every space character removed (e.g. "Direct Line" -> "DirectLine").

    Only the plain space character is stripped; tabs and other whitespace are kept.
    """
    pieces = label.split(" ")
    return "".join(pieces)

In [None]:
def append_log(i: int, url: str, status: str, message: str = ""):
    """Append one entry to the scrape log CSV.

    Args:
        i: Position of the URL in the run.
        url: URL that was processed.
        status: Outcome label stored in the ``status`` column.
        message: Optional detail text (defaults to an empty string).

    The first write creates the file with a header row; later writes append
    a single data row without a header.
    """
    entry = pd.DataFrame(
        [{"i": i, "url": url, "status": status, "message": message}]
    )

    if os.path.exists(LOG_CSV):
        entry.to_csv(LOG_CSV, index=False, mode="a", header=False)
    else:
        entry.to_csv(LOG_CSV, index=False)


def get_resume_index() -> int:
    """Return the largest logged index whose status is ``"SUCCESS"``.

    Returns:
        0 when the log file does not exist, has no rows, or has no SUCCESS
        rows; otherwise the maximum ``i`` among the SUCCESS rows.
    """
    if os.path.exists(LOG_CSV):
        log = pd.read_csv(LOG_CSV)
        if not log.empty:
            success_indices = log.loc[log["status"] == "SUCCESS", "i"]
            if not success_indices.empty:
                return int(success_indices.max())

    return 0


# Highest index already logged as SUCCESS (0 when there is no usable log).
resume_from = get_resume_index()
print("Resuming from index:", resume_from)

In [None]:
failed_urls = []
processed = 0

# Fixed column layout for the wide CSV. Rows are appended one at a time under a
# single header, so every row must carry the same columns in the same order no
# matter how many contacts a page has; absent contacts/fields are left empty.
WIDE_COLUMNS = ["url"] + [
    f"c{idx}_{normalize_label(label)}"
    for idx in range(1, MAX_CONTACT + 1)
    for label in GLOBAL_LABELS
]

# Write the header only when the output file is missing or still empty.
write_header = (
    not os.path.exists(OUT_WIDE_CSV) or os.path.getsize(OUT_WIDE_CSV) == 0
)
if not write_header:
    existing_columns = pd.read_csv(OUT_WIDE_CSV, nrows=0).columns.tolist()
    if existing_columns != WIDE_COLUMNS:
        print(
            "WARNING:", OUT_WIDE_CSV,
            "has a different header than the rows this cell writes;",
            "appended rows will not line up with it.",
        )

for i, url in enumerate(URLS, 1):
    # Indices are 1-based; skip everything up to the last logged success.
    if i <= resume_from:
        continue

    try:
        # NOTE(review): scrape_office_contacts returns [] for restricted,
        # members-only and timed-out pages, so those are logged as SUCCESS
        # with an empty row — confirm this is intended.
        contacts = scrape_office_contacts(url)

        # Build a wide-format row: url + c1_Name, c1_Email, ..., cN_Fax
        wide_row = {"url": url}
        for idx, contact in enumerate(contacts[:MAX_CONTACT], start=1):
            for label in GLOBAL_LABELS:
                wide_row[f"c{idx}_{normalize_label(label)}"] = contact.get(label)

        # Persist immediately so an interrupted run loses at most one URL.
        pd.DataFrame([wide_row], columns=WIDE_COLUMNS).to_csv(
            OUT_WIDE_CSV, index=False, mode="a", header=write_header
        )
        write_header = False

        append_log(i, url, "SUCCESS", "")
        processed += 1

    except Exception as e:
        failed_urls.append({"url": url, "error": str(e)})
        append_log(i, url, "FAILED", str(e))
        print("FAILED:", url, "->", e)

    # Simple rate limiting
    time.sleep(random.uniform(3.0, 6.0))

    # Longer pause after every 10th URL position.
    if i % 10 == 0:
        print(f"{i}/{len(URLS)} processed in this run:", processed)
        time.sleep(random.uniform(20, 40))

print("Successful URLs in this run:", processed)
print("Failed URLs in this run:", len(failed_urls))

In [None]:
# Load wide table as text: type inference would otherwise turn phone/fax values
# into numbers (dropping leading zeros or rendering them as floats).
df_wide = pd.read_csv(OUT_WIDE_CSV, dtype=str)

# Optional: merge company_name from the original links file.
# One name per URL, so the left merge below cannot multiply contact rows.
if "company_name" in df_links.columns:
    df_comp_unique = (
        df_links[["url", "company_name"]]
        .drop_duplicates(subset="url")
        .reset_index(drop=True)
    )
else:
    # The links file only guarantees a 'url' column; keep the output schema
    # stable with an empty company_name.
    df_comp_unique = (
        df_links[["url"]]
        .drop_duplicates()
        .assign(company_name=pd.NA)
        .reset_index(drop=True)
    )

# Build long table: one row per contact field
value_cols = [c for c in df_wide.columns if c != "url"]

df_long = (
    df_wide
    .set_index("url")[value_cols]
    .stack()
    .reset_index(name="value")
)

df_long.columns = ["url", "contact_field", "value"]

# Extract contact index and field name from columns like "c1_Name"
df_long["contact_index"] = (
    df_long["contact_field"]
    .str.extract(r"c(\d+)_")[0]
    .astype(float)
)

df_long["field_name"] = (
    df_long["contact_field"]
    .str.replace(r"c\d+_", "", regex=True)
)

# Drop fields that have no value for a given contact.
df_long = df_long.dropna(subset=["value"]).reset_index(drop=True)

# Merge company_name
df_long_with_name = df_long.merge(
    df_comp_unique[["url", "company_name"]],
    on="url",
    how="left"
)

# Reorder columns: company_name first, url last, everything else in between
cols = df_long_with_name.columns.tolist()
new_cols = (
    ["company_name"] +
    [c for c in cols if c not in ["company_name", "url"]] +
    ["url"]
)

df_long_reordered = df_long_with_name[new_cols]

df_long_reordered.to_csv(OUT_LONG_CSV, index=False)
df_long_reordered.head()