# Introduction to Web Scraping with Selenium
This notebook may accompany the presentation. It introduces the basic steps of automating a browser to collect data.

In [77]:
# 1. Setup
# Ensure you have selenium installed: pip install selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Initialize the browser (Firefox)
# This will open a new browser window controlled by Python
# If you don't have Firefox setup, you can try webdriver.Chrome() if you have ChromeDriver
browser = webdriver.Firefox() 

In [78]:
# 2. Navigation
# Tell the browser to go to a specific URL
url = 'https://www.bbc.com/news/world/europe'
browser.get(url)
print(f"Successfully visited: {browser.title}")

Successfully visited: Europe | Latest News & Updates | BBC News


In [79]:
# 1. Switch to the frame first (crucial for cookie banners!)
# Based on common BBC structures:
browser.switch_to.frame(browser.find_element(By.CSS_SELECTOR, "iframe[id^='sp_message_iframe']"))

# 2. Use a simpler, more stable locator
# From your image_e61878.png, we see title='I agree'
button = browser.find_element(By.CSS_SELECTOR, "button[title='I agree']")
button.click()

## 3. Locating Elements
We need to find "hooks" in the HTML to specific content.
Common methods:
* `By.ID`
* `By.CLASS_NAME`
* `By.TAG_NAME`
* `By.CSS_SELECTOR`

In [80]:
# Example: Find all article containers
# We'll use TAG_NAME 'h2' as an example since headlines are usually h2.
# In the assignment, you'll need to inspect the specific page to find the right class or container.

headlines = browser.find_elements(By.TAG_NAME,'h2')

print(f"Found {len(headlines)} headlines.")

# Print the first 5 headlines found
for i, headline in enumerate(headlines[:5]):
    print(f"{i+1}: {headline.text}")

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:202:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:782:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:581:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:169:4
GeckoDriver.prototype.findElements@chrome://remote/content/marionette/driver.sys.mjs:2005:15
dispatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20


In [None]:
# 4. Extracting Data
# Once we have an element, we want its properties like .get_attribute('href')...

# Let's find links (anchor tags <a>) 
links = browser.find_elements(By.TAG_NAME, 'a')

print("Sample of extracted data:")
count = 0
for link in links:
    # ... more data ...
    href = link.get_attribute('href')
    # Filter for interesting links (e.g., containing 'news') to avoid menu links
    if href and '/news/' in href:
        print(f"URL: {href}")
        print("-" * 20)
        count += 1
        if count >= 3: break

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:202:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:782:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:581:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:169:4
GeckoDriver.prototype.findElements@chrome://remote/content/marionette/driver.sys.mjs:2005:15
dispatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20


In [None]:
# 5. Interaction (Clicking Buttons)
# Important for "Pagination" in the assignment (Clicking 'Next' or region buttons).
#Cookie clicker

button = browser.find_element(By.TAG_NAME , 'Europe')
button.click()

print("To click a button:")
print("1. Inspect the button to find its ID or Class")
print("2. Use browser.find_element(...) to select it")
print("3. Call .click() on that element")

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:202:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:782:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:581:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:169:4
GeckoDriver.prototype.findElement@chrome://remote/content/marionette/driver.sys.mjs:1906:15
dispatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20


In [None]:
# 6. Handling Delays
# When you click, the page needs time to load before you can scrape again.
# If you scrape too fast, you might get old data or errors.

print("Simulating a wait for page load...")
time.sleep(3) # Pause for 3 seconds
print("Starting to scrape...")

Simulating a wait for page load...
Starting to scrape...


In [82]:
# 7. Cleanup
# Close the browser when done to free up resources.
browser.quit()