In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager  
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [None]:
def init_driver(url):
    """Start a Chrome session and load ``url``.

    The chromedriver binary is resolved through ``webdriver_manager``.

    Args:
        url: Address of the page to open.

    Returns:
        The running ``webdriver.Chrome`` instance with ``url`` loaded.

    Raises:
        Whatever ``driver.get`` raises. In that case the browser that was
        just started is quit before the exception propagates, so a failed
        page load does not leave an orphaned Chrome process behind.
    """
    options = webdriver.ChromeOptions()
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
    except BaseException:
        # Also covers KeyboardInterrupt on a hanging page load; the error is
        # re-raised unchanged once the browser has been shut down.
        driver.quit()
        raise
    return driver


def close_driver(driver):
    """Shut down the browser session by calling ``driver.quit()``."""
    driver.quit()


def handle_cookie_and_filters(driver):
    """Accept the cookie-consent prompt if one shows up.

    Waits up to 10 seconds for the consent iframe (served from
    ``cdn.privacy-mgmt.com``), switches into it, waits up to 10 seconds for
    the "Accept all" button and clicks it. A timeout at either step is
    reported on stdout and otherwise ignored.

    Once the iframe has been entered, the driver is always switched back to
    the top-level document, including when the button never becomes
    clickable, so later element lookups are not run inside the iframe.

    Args:
        driver: Selenium WebDriver whose current page may show the prompt.

    Returns:
        None.
    """
    try:
        # Wait for the Sourcepoint iframe to appear
        iframe = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "iframe[src*='https://cdn.privacy-mgmt.com']")
            )
        )
    except TimeoutException:
        print("No cookie prompt found or timed out.")
        return

    driver.switch_to.frame(iframe)
    print("Switched into cookie consent iframe.")

    try:
        # Wait for the "Accept all" button to become clickable
        cookie_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[@title='Accept all' or @aria-label='Accept all']")
            )
        )

        # Click through JavaScript rather than the native .click()
        driver.execute_script("arguments[0].click();", cookie_button)
        print("Cookie consent accepted.")
    except TimeoutException:
        print("No cookie prompt found or timed out.")
    finally:
        # Leave the iframe on every path so the caller works on the main page
        driver.switch_to.default_content()


In [None]:
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time, random, csv
from datetime import datetime

def _click_load_more(driver):
    """Click the "Load More" button when it is present and displayed."""
    try:
        load_more = driver.find_element(By.CSS_SELECTOR, ".ui-liveblog-button--load-more")
        if load_more.is_displayed():
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
            time.sleep(1)
            load_more.click()
            print("Clicked 'Load More' button...")
            time.sleep(random.uniform(2, 4))
    except (NoSuchElementException, ElementClickInterceptedException):
        pass  # no clickable button right now; the caller keeps scrolling


def _extract_article(article):
    """Turn one article element into a ``(row, date)`` pair; raises if a part is missing."""
    time_elem = article.find_element(By.CSS_SELECTOR, ".ncpost-header time")
    # Only the leading YYYY-MM-DD part of the ``datetime`` attribute is used.
    date_str = time_elem.get_attribute("datetime")[:10]
    date_obj = datetime.strptime(date_str, "%Y-%m-%d")

    title = article.find_element(By.CLASS_NAME, "ncpost-title").text.strip()

    paragraphs = article.find_elements(By.CSS_SELECTOR, ".ncpost-content p")
    content = " ".join(p.text.strip() for p in paragraphs)

    return {"Title": title, "Content": content, "Date": date_str}, date_obj


def scrape_articles(driver, output_csv="articles.csv", cutoff_date=None):
    """Scroll the loaded page and collect its articles until a cutoff date.

    Each pass scrolls to the bottom, clicks "Load More" when available, and
    parses every ``div[role='article']`` element not handled by an earlier
    pass. Rows are written to ``output_csv`` (overwritten at start) and
    flushed one by one, so an interrupted run keeps what was scraped so far.

    Scraping ends when an article dated before ``cutoff_date`` is reached
    (that boundary article is still saved), when the page height stops
    growing, or when an unexpected error occurs (reported on stdout).

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        output_csv: Path of the CSV file to write (columns Title, Content, Date).
        cutoff_date: Naive ``datetime``; defaults to ``datetime(2025, 10, 20)``.

    Returns:
        List of ``{"Title", "Content", "Date"}`` dicts in the order scraped.
    """
    if cutoff_date is None:
        cutoff_date = datetime(2025, 10, 20)

    all_data = []
    # Number of DOM article elements already looked at. This is tracked
    # separately from len(all_data): a skipped article must not shift the
    # offset, otherwise later passes would re-read (and duplicate) posts.
    processed = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["Title", "Content", "Date"])
        writer.writeheader()
        f.flush()

        try:
            while True:
                # Scroll to the bottom to trigger lazy loading
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(random.uniform(3, 5))

                _click_load_more(driver)

                articles = driver.find_elements(By.XPATH, "//div[@role='article']")

                for article in articles[processed:]:  # only elements not seen yet
                    processed += 1
                    try:
                        data, date_obj = _extract_article(article)
                    except Exception as e:
                        print(f"Skipping article due to error: {e}")
                        continue

                    all_data.append(data)
                    # Persist immediately so a crash loses at most the current row
                    writer.writerow(data)
                    f.flush()

                    # Stop when reaching older content
                    if date_obj < cutoff_date:
                        print(f"Reached cutoff date ({data['Date']}). Stopping.")
                        return all_data

                # Check if page height stopped changing (no more content)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    print("No more new content loaded — ending scroll.")
                    break
                last_height = new_height

        except Exception as e:
            print(f"Unexpected error occurred: {e}")
        finally:
            # Report only; no return here, so KeyboardInterrupt and similar
            # exceptions are not silently swallowed.
            print(f"Scraping finished. {len(all_data)} articles saved.")

    return all_data

In [None]:
# Sky Sports "Transfer Centre" live-blog page that the scraper is pointed at.
BASE_URL = (
    "https://www.skysports.com/football/live-blog/11095/12476234/"
    "transfer-centre-live-football-transfer-news-updates-and-rumours"
)

def main(url):
    """Run the whole scrape for ``url`` and return the collected rows.

    Opens the browser, maximizes the window, dismisses the cookie prompt and
    scrapes the articles. Every step after the browser has started runs
    inside ``try``/``finally``, so the browser is closed even when one of
    them (including ``maximize_window``) fails.

    Args:
        url: Address of the live-blog page to scrape.

    Returns:
        The list returned by ``scrape_articles``.
    """
    driver = init_driver(url)
    try:
        driver.maximize_window()
        handle_cookie_and_filters(driver)
        return scrape_articles(driver)
    finally:
        close_driver(driver)  # always close, even on error

# Run the full pipeline; `article` holds the list of scraped rows returned by main().
article = main(BASE_URL)

Switched into cookie consent iframe.
Cookie consent accepted.
Clicked 'Load More' button...
Clicked 'Load More' button...
Reached cutoff date (2025-10-19). Stopping.
Scraping finished. 50 articles saved.


In [None]:
# Report how many rows the scrape returned.
print(f"Total of {len(article)} transfer news items scraped.")

Total of 5508 transfer news scrapped.
