In [None]:
%pip install beautifulsoup4 requests

In [None]:
import requests
import csv
import os
from bs4 import BeautifulSoup

# Scrape the emission-standard tables from the Wikipedia article below and
# write each qualifying table to its own CSV file in the current working
# directory. The file name is derived from the nearest preceding <h3> heading.
url = 'https://en.wikipedia.org/wiki/European_emission_standards#Toxic_emission:_stages_and_legal_framework'

# Send a GET request to the URL. A timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() turns an HTTP error page into
# an explicit exception instead of silently producing no CSV files.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the tables in the HTML content
tables = soup.find_all('table')

# Loop through each table and create a CSV file for it
for table in tables:
    # Find the previous H3 heading for the table
    prev_h3 = table.find_previous('h3')

    # Find all the rows in the table
    rows = table.find_all('tr')

    # Skip tables with less than 2 rows or without previous H3 heading
    if len(rows) < 2 or prev_h3 is None:
        continue

    # Use the text of the previous H3 heading as the filename
    filename = prev_h3.get_text(strip=True).lower().replace('[edit]', '').replace(' ', '_')

    # Remove everything after the first '(' in the filename
    filename = filename.split('(', 1)[0]

    # Remove everything before the first '_' in the filename
    # (a name without any '_' is kept unchanged)
    filename = filename.split('_', 1)[-1]

    # Add the ".csv" extension to the filename
    filename = f'{filename}.csv'

    # Text of the paragraph preceding the table; a table with no preceding
    # paragraph is treated as having empty text instead of raising
    # AttributeError on None.
    prev_par_tag = table.find_previous('p')
    prev_par = prev_par_tag.get_text(strip=True) if prev_par_tag is not None else ''

    # Skip the table when its file already exists (so only the first table
    # under a heading is written and existing files are never overwritten),
    # or when it is a light-commercial-vehicles table whose preceding
    # paragraph does not mention 'Class II)'.
    if os.path.exists(filename) or ('Class II)' not in prev_par and 'light_commercial_vehicles' in filename):
        continue

    # Open the CSV file in write mode with 'utf-8' encoding
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        # Create a CSV writer object
        writer = csv.writer(csvfile)

        # Loop through each row and write it to the CSV file
        for row_index, row in enumerate(rows):
            # Find all the cells in the row
            cells = row.find_all(['th', 'td'])

            # A <tr> without any th/td cells has nothing to write
            if not cells:
                continue

            first_cell = cells[0].get_text(strip=True)

            # Skip multi-cell rows after the first row whose first cell does
            # not contain 'Euro', and skip any row whose first cell has '^'
            # within its first 2 characters.
            if ('Euro' not in first_cell and row_index != 0 and len(cells) > 1) or '^' in first_cell[:2]:
                continue

            # Extract the text from each cell and write it to the CSV file
            writer.writerow([cell.get_text(strip=True) for cell in cells])