# Web Scraping
# Day 22: 30DaysOfPython Challenge

### 💻 Exercises: Day 22

In [5]:
# Scrape the following website and store the data as a JSON file (url = 'http://www.bu.edu/president/boston-university-facts-stats/').
import requests
import json
from bs4 import BeautifulSoup

# Fetch the Boston University facts page, collect the text of every <p>
# element, and save it as JSON in 'scraped_data.json'.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Always pass a timeout: without one, requests may wait indefinitely on an
# unresponsive server and hang the notebook kernel. On expiry requests raises
# an exception instead of blocking.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')

    # Coarse extraction: every paragraph on the page is kept, in document order.
    paragraphs = soup.find_all('p')

    # Single-key dictionary so the JSON file has a named top-level field.
    data = {'paragraphs': [paragraph.get_text() for paragraph in paragraphs]}

    # Converting the data to JSON
    json_data = json.dumps(data, indent=2)

    # Explicit encoding so the file is written the same way on every platform
    # instead of depending on the locale's default encoding.
    with open('scraped_data.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

    print('Data has been scraped and stored as "scraped_data.json".')
else:
    # Any non-200 status is reported rather than parsed.
    print('Failed to retrieve the webpage. Status code:', response.status_code)

Data has been scraped and stored as "scraped_data.json".


In [6]:
# Extract the table in this url (https://archive.ics.uci.edu/ml/datasets.php) and change it to a json file

# Fetch the UCI datasets page, pull the text of every cell from the table whose
# cellpadding attribute is "3", and save the rows as JSON in 'table_data.json'.
# NOTE(review): the recorded run of this cell printed a 404 for this URL, so
# the page has presumably moved — verify the current address before relying
# on this cell.
table_url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Always pass a timeout so an unresponsive server cannot hang the kernel.
table_response = requests.get(table_url, timeout=10)

if table_response.status_code == 200:
    table_soup = BeautifulSoup(table_response.content, 'html.parser')

    # Finding the table with cellpadding attribute equal to 3
    target_table = table_soup.find('table', {'cellpadding': '3'})

    if target_table:
        # Each <tr> becomes one list of stripped cell texts; header (<th>) and
        # data (<td>) cells are both kept, in document order.
        table_data = []
        for row in target_table.find_all('tr'):
            row_data = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            table_data.append(row_data)

        # Converting the data to JSON
        json_data = json.dumps(table_data, indent=2)

        # Explicit encoding so the output does not depend on the locale default.
        with open('table_data.json', 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        print('Table data has been extracted and stored as "table_data.json".')
    else:
        print('No table found on the webpage.')
else:
    # Any non-200 status is reported rather than parsed.
    print('Failed to retrieve the webpage. Status code:', table_response.status_code)

Failed to retrieve the webpage. Status code: 404


In [7]:
# Scrape the presidents table and store the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States).
# The table is not very structured, and the scraping may take a very long time.

# Fetch the Wikipedia list of US presidents, extract the first table with the
# "wikitable" class, and save it as JSON in 'presidents_data.json'. The output
# is a list of rows: the first entry is the header row, the rest are data rows.
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

# Always pass a timeout so an unresponsive server cannot hang the kernel.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table containing the list of presidents
    presidents_table = soup.find('table', {'class': 'wikitable'})

    if presidents_table:
        rows = presidents_table.find_all('tr')

        # Take the column headers from the first row only. Searching the whole
        # table for <th> would also pick up every <th> used as a row header in
        # the body rows and pollute the header list.
        header_cells = rows[0].find_all('th') if rows else []
        headers = [header.get_text(strip=True) for header in header_cells]

        table_data = [headers]

        # Remaining rows: keep both <th> (row-header) and <td> cells, in order.
        for row in rows[1:]:
            row_data = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            table_data.append(row_data)

        # Converting the data to JSON
        json_data = json.dumps(table_data, indent=2)

        # Explicit encoding so the output does not depend on the locale default.
        with open('presidents_data.json', 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        print('Presidents table data has been extracted and stored as "presidents_data.json".')
    else:
        print('No presidents table found on the webpage.')
else:
    # Any non-200 status is reported rather than parsed.
    print('Failed to retrieve the webpage. Status code:', response.status_code)

Presidents table data has been extracted and stored as "presidents_data.json".
