# Web Scraper for FloodList

The following script is an exercise in learning web scraping with Beautiful Soup, using a website named FloodList. It is part of my solo self-learning project on NLP. 72 of the 80 articles linked from the first 5 pages were scraped into a CSV file; the ones that failed were later scraped into another CSV file.

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Base URLs: the UK listing page, and the prefix that a page number is appended to
base_url = "https://floodlist.com/europe/united-kingdom"
page_url = "https://floodlist.com/europe/united-kingdom/page/"

# User-Agent rotation for mimicking browser behavior.
# Every entry should be a string that a real browser actually sends; a
# self-contradictory string is an easy signal for bot detection.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Firefox reports the Gecko engine with an rv: token, never AppleWebKit.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

# Retry mechanism for fetching pages
def fetch_page_with_retry(url, headers, retries=3, delay=5):
    """Fetch ``url`` with an HTTP GET, retrying when an attempt fails.

    An attempt fails when ``requests`` raises a ``RequestException`` or the
    response status is anything other than 200. Each failure is printed.

    Args:
        url: Address to request.
        headers: HTTP headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The response of the first attempt that returned status 200, or
        ``None`` when every attempt failed (or ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {url} ({e})")
        else:
            if response.status_code == 200:
                return response
            print(f"Attempt {attempt + 1} failed: {url} (Status: {response.status_code})")
        # Wait only when another attempt follows; sleeping after the final
        # failure would just postpone returning None.
        if attempt < retries - 1:
            time.sleep(delay)
    return None

# Initialize variables
article_links = []
articles_data = []


def _extract_article_links(soup):
    """Return the href of the first linked <a> in each 'more-link' div of ``soup``."""
    hrefs = []
    for container in soup.find_all('div', class_='more-link'):
        # href=True only matches anchors that carry an href, and a div with
        # no such anchor yields None instead of raising on ['href'].
        anchor = container.find('a', href=True)
        if anchor is not None and anchor['href']:
            hrefs.append(anchor['href'])
    return hrefs


# Scrape the first page separately (it is requested from base_url, not page_url)
print("Scraping the first page...")
headers = {'User-Agent': random.choice(user_agents)}
response = fetch_page_with_retry(base_url, headers)
if response:
    soup = BeautifulSoup(response.content, 'html.parser')
    article_links.extend(_extract_article_links(soup))
else:
    print(f"Failed to fetch listing page: {base_url}")

# Scrape subsequent pages
for page in range(2, 6):  # Adjust the range as needed, e.g. range(2, 18) covers pages 2 through 17
    print(f"Scraping page {page}...")
    listing_url = f"{page_url}{page}"
    headers = {'User-Agent': random.choice(user_agents)}
    response = fetch_page_with_retry(listing_url, headers)
    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        article_links.extend(_extract_article_links(soup))
    else:
        print(f"Failed to fetch listing page: {listing_url}")
    time.sleep(25)  # Delay between page requests

print(f"Total article links found: {len(article_links)}")

# Scrape all articles from collected links
failed_links = []  # URLs that could not be fetched or parsed, kept for a later re-run
total_links = len(article_links)
for i, link in enumerate(article_links):
    print(f"Scraping article {i + 1}/{total_links}: {link}")
    try:
        headers = {'User-Agent': random.choice(user_agents)}
        article_response = fetch_page_with_retry(link, headers)
        if article_response:
            article_soup = BeautifulSoup(article_response.content, 'html.parser')

            # Extract title
            title_tag = article_soup.find('h1')
            title = title_tag.text.strip() if title_tag else "No Title"

            # Extract date
            date_tag = article_soup.find('time')
            date = date_tag.text.strip() if date_tag else "No Date"

            # Extract full text: all <p> elements inside the entry-content div
            content_div = article_soup.find('div', class_='entry-content')
            if content_div:
                paragraphs = content_div.find_all('p')
                full_text = " ".join(p.text.strip() for p in paragraphs)
            else:
                full_text = "No Content"

            # Append data
            articles_data.append({
                'Title': title,
                'Date': date,
                'Full Text': full_text,
                'Link': link
            })

        else:
            print(f"Failed to fetch article: {link}")
            failed_links.append(link)

    except Exception as e:
        # Best effort: one bad article must not abort the whole run.
        print(f"Error scraping article: {e}")
        failed_links.append(link)

    # 10-second delay between article requests. It sits outside the try block
    # so it still applies after an error, and is skipped after the last link.
    if i < total_links - 1:
        time.sleep(10)

# Report the links that need another attempt
if failed_links:
    print(f"{len(failed_links)} article(s) could not be scraped:")
    for failed_link in failed_links:
        print(f"  {failed_link}")

# Save data to CSV
if articles_data:
    df = pd.DataFrame(articles_data)
    df.to_csv('uk_flood_articles_final.csv', index=False)
    print("Data saved to 'uk_flood_articles_final.csv'")
else:
    print("No data scraped.")


Scraping the first page...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Total article links found: 80
Scraping article 1/80: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024
Attempt 1 failed: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024 (Status: 502)
Attempt 2 failed: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024 (Status: 502)
Attempt 3 failed: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024 (Status: 502)
Failed to fetch article: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024
Scraping article 2/80: https://floodlist.com/europe/floods-england-scotland-storm-babet-october-2023
Scraping article 3/80: https://floodlist.com/europe/united-kingdom/floods-devon-somerset-may-2023
Scraping article 4/80: https://floodlist.com/europe/united-kingdom/intense-rain-increase-climate-change
Scraping article 5/80: https://floodlist.com/

In [34]:
## Second pass: scrape the articles that failed to get scraped earlier

# Article URLs to retry, one per line
article_urls = """
https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024
https://floodlist.com/europe/united-kingdom/flood-risk-somerset-levels-january-2023
https://floodlist.com/protection/trees-woodlands-boost-uk-flood-protection
https://floodlist.com/protection/englands-environment-agency-natural-flood-management
https://floodlist.com/europe/united-kingdom/climate-warmer-wetter-britain
https://floodlist.com/asia/japans-earthquake-flood-protection
https://floodlist.com/europe/united-kingdom/uk-flash-floods-cornwall-december-2019
https://floodlist.com/europe/united-kingdom/england-flash-floods-24-september-2019
https://floodlist.com/protection/brunel-university-london-launches-centre-for-flood-risk-and-resilience
""".split()

# One fixed User-Agent, sent with every request to mimic a browser
browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
headers = {'User-Agent': browser_user_agent}

# Scraped article records are collected here (starts empty for this pass)
articles_data = []

# Function to scrape a single article
def scrape_article(url, timeout=10):
    """Download one article page and extract its title, date and body text.

    The request is sent with the module-level ``headers``. Any failure is
    printed and reported as ``None`` rather than raised.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A dict with the keys 'Title', 'Date', 'Full Text' and 'Link', or
        ``None`` when the status is not 200 or an exception occurs.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch article: {url} (Status Code: {response.status_code})")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title
        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag else "No Title"

        # Extract date
        date_tag = soup.find('time')
        date = date_tag.text.strip() if date_tag else "No Date"

        # Extract full text: all <p> elements inside the entry-content div
        content_div = soup.find('div', class_='entry-content')
        if content_div:
            paragraphs = content_div.find_all('p')
            full_text = " ".join(p.text.strip() for p in paragraphs)
        else:
            full_text = "No Content"

        # Return the article data
        return {
            'Title': title,
            'Date': date,
            'Full Text': full_text,
            'Link': url
        }

    except Exception as e:
        # Best effort: report the problem and let the caller move on.
        print(f"Error scraping article: {url} ({e})")
        return None

# Iterate over the list of URLs and scrape each one
total_urls = len(article_urls)
still_failed = []  # URLs that failed again on this pass
for i, url in enumerate(article_urls):
    print(f"Scraping article {i + 1}/{total_urls}: {url}")
    article_data = scrape_article(url)
    if article_data:
        articles_data.append(article_data)
    else:
        still_failed.append(url)
    # Delay between requests to avoid overwhelming the server; nothing
    # follows the last URL, so there is no need to wait after it.
    if i < total_urls - 1:
        time.sleep(25)

# Report the URLs that still could not be scraped
if still_failed:
    print(f"{len(still_failed)} article(s) still could not be scraped:")
    for failed_url in still_failed:
        print(f"  {failed_url}")

# Save the scraped data to a CSV file
if articles_data:
    df = pd.DataFrame(articles_data)
    df.to_csv('failed_articles_scraped.csv', index=False)
    print("Scraping complete. Data saved to 'failed_articles_scraped.csv'.")
else:
    print("No articles were successfully scraped.")


Scraping article 1/9: https://floodlist.com/europe/united-kingdom/storm-henk-floods-january-2024
Scraping article 2/9: https://floodlist.com/europe/united-kingdom/flood-risk-somerset-levels-january-2023
Scraping article 3/9: https://floodlist.com/protection/trees-woodlands-boost-uk-flood-protection
Scraping article 4/9: https://floodlist.com/protection/englands-environment-agency-natural-flood-management
Scraping article 5/9: https://floodlist.com/europe/united-kingdom/climate-warmer-wetter-britain
Scraping article 6/9: https://floodlist.com/asia/japans-earthquake-flood-protection
Scraping article 7/9: https://floodlist.com/europe/united-kingdom/uk-flash-floods-cornwall-december-2019
Scraping article 8/9: https://floodlist.com/europe/united-kingdom/england-flash-floods-24-september-2019
Scraping article 9/9: https://floodlist.com/protection/brunel-university-london-launches-centre-for-flood-risk-and-resilience
Scraping complete. Data saved to 'failed_articles_scraped.csv'.
