In [None]:
# colab has an older version of beautifulsoup by default
# here we upgrade it
# if you are working on your own computer, you can probably comment this step out and skip it
!pip install --upgrade beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.11.2
    Uninstalling beautifulsoup4-4.11.2:
      Successfully uninstalled beautifulsoup4-4.11.2
Successfully installed beautifulsoup4-4.12.2


In [None]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [None]:
# 2. store the url of the page we want to scrape (the English Wikipedia article on Berlin) in a variable
url = "https://en.wikipedia.org/wiki/Berlin"

In [None]:
# 3. download the html with a GET request
# the timeout (in seconds) stops this cell from hanging forever if the server never answers
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [None]:
# 4.1. create the 'soup': hand the raw bytes of the response to the "html.parser" parser
soup = BeautifulSoup(markup=response.content, features="html.parser")
# 4.2. to check that the parsed html looks like it should, uncomment the next line
# soup

In [None]:
# 5. retrieve/extract the desired info (here, you'll paste the "Selector" you copied before to get the element that holds the data you want)

# let's first try to get the name of the city
# by copying the selector we can see that it has the id firstHeading (it also has a class by the same name!)
# select() returns a list of every matching element, even when there is only one match
soup.select("#firstHeading")

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Berlin</span></h1>]

In [None]:
# take the first (and here only) match and keep just its text, dropping the html tags
soup.select("#firstHeading")[0].get_text()

'Berlin'

In [None]:
# Let's use the class infobox-data to target the country:
# on this page the first element with that class holds the country name
soup.select(".infobox-data")[0].get_text()

'Germany'

In [None]:
#soup.select(".infobox-data")[0].get_text()

Now we just carry on exploring the HTML, finding classes, ids, and selectors to target the information we need. Hopefully these classes and selectors will be the same across all city pages on Wikipedia, but they are likely to differ in a few places, so we will have to make our code robust to this.

In [None]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def get_website(soup):
    """Return the text of the infobox 'Website' entry, or None if there is none.

    Args:
        soup: the parsed page (a BeautifulSoup object, or any object whose
            ``find`` method accepts the same keyword arguments).

    Returns:
        The ``.text`` of the first element with class 'infobox-data' that
        follows the 'infobox-label' element whose string is exactly
        "Website". ``None`` when either of the two elements is missing.
    """
    # Locate the label cell of the 'Website' row
    website_label = soup.find(class_="infobox-label", string="Website")
    if website_label is None:
        return None

    # The value is the next 'infobox-data' element after the label
    website_value = website_label.find_next(class_='infobox-data')
    if website_value is None:
        # A label without a data cell counts as "no website" instead of raising
        return None
    return website_value.text

def get_population(soup):
    """Return the first numeric text in the row after the infobox 'Population' header.

    Args:
        soup: the parsed page (a BeautifulSoup object, or any object whose
            ``select_one`` method behaves the same way).

    Returns:
        Whatever ``find`` yields for the first text containing at least one
        digit inside the element that follows the header's parent, or
        ``None`` when the header, the following element, or a numeric text
        is missing.
    """
    # Header cell of the infobox whose text contains 'Population'
    population_header = soup.select_one('th.infobox-header:-soup-contains("Population")')
    if population_header is None:
        return None

    # The figures live in the sibling that follows the header's parent
    # (presumably the next table row of the infobox)
    following_row = population_header.parent.find_next_sibling()
    if following_row is None:
        return None

    # `string=` is the current name of the deprecated `text=` keyword
    return following_row.find(string=re.compile(r'\d+'))

def clean_data(df):
    """Strip the coordinate symbols from the 'latitude' and 'longitude' columns in place.

    Each value is cut at the seconds mark (″), any trailing hemisphere letter
    (N, S, E, W) is removed, the degree sign becomes a '.', and the minutes
    mark is dropped. For example '52°31′12″N' becomes '52.3112' and
    '53°33′N' becomes '53.33'.

    Note that the result is a compact string of the original digits, not a
    decimal-degree number, and that the hemisphere is discarded.

    Args:
        df: DataFrame with string columns 'latitude' and 'longitude'.

    Returns:
        None. ``df`` is modified in place.
    """
    for column in ('latitude', 'longitude'):
        df[column] = (
            df[column]
            .str.split('″').str[0]
            # Coordinates given without seconds have no ″ to cut at, so the
            # hemisphere letter would otherwise be left on the end
            .str.rstrip('NSEW')
            .str.replace('°', '.', regex=False)
            .str.replace('′', '', regex=False)
        )

def get_city_info(city, timeout=10):
    """Scrape basic facts about ``city`` from its English Wikipedia page.

    Args:
        city: page title as used in the Wikipedia URL, e.g. 'Berlin'.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A dict with the keys 'city', 'country', 'latitude', 'longitude',
        'website' and 'population', or ``None`` (after printing a message)
        when one of the expected page elements is missing.

    Raises:
        Any exception raised by ``requests.get`` (for example on a timeout)
        is not caught here and propagates to the caller.
    """
    # Construct the URL and download the page
    url = f'https://en.wikipedia.org/wiki/{city}'
    page = requests.get(url, timeout=timeout)
    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(page.content, 'html.parser')

    try:
        # select_one returns None for a selector with no match, so calling
        # .get_text() on it raises AttributeError, which is handled below
        return {
            'city': soup.select_one(".firstHeading").get_text(),
            'country': soup.select_one(".infobox-data").get_text(),
            'latitude': soup.select_one(".latitude").get_text(),
            'longitude': soup.select_one(".longitude").get_text(),
            'website': get_website(soup),
            'population': get_population(soup)
        }
    except AttributeError:
        # If any data is missing, print an error message and return None
        print(f'Failed to get data for {city}')
        return None

def recreate_wiki(cities):
    """Scrape every city in ``cities`` and return the results as one DataFrame.

    Args:
        cities: iterable of Wikipedia page titles, e.g. ['Berlin', 'Hamburg'].

    Returns:
        A DataFrame with one row per city that was scraped successfully,
        with the coordinate columns passed through ``clean_data``. Cities
        for which ``get_city_info`` returned None are left out; if none
        succeeded, the DataFrame is empty.
    """
    # Get info for each city
    scraped = (get_city_info(city) for city in cities)
    # get_city_info returns None for a page it could not read; a None entry
    # is not a valid row, so drop it before building the table
    city_data = [info for info in scraped if info is not None]
    # Convert the list of dictionaries to a DataFrame
    cities_df = pd.DataFrame(city_data)
    # With no rows there are no coordinate columns to clean
    if not cities_df.empty:
        clean_data(cities_df)
    return cities_df

In [None]:
# Cities to look up; each name is inserted as-is into the Wikipedia URL, so it must match the page title
list_of_cities = ['Berlin', 'Hamburg', 'Bremen', 'Munich', 'Stuttgart']
# Scrape every city; the returned DataFrame is displayed as the output of this cell
recreate_wiki(list_of_cities)

  return population_elem.parent.find_next_sibling().find(text=re.compile(r'\d+')) if population_elem else None # note the different syntax for the if/else clause here


Unnamed: 0,city,country,latitude,longitude,website,population
0,Berlin,Germany,52.3112,13.2418,berlin.de,3677472
1,Hamburg,Germany,53.33N,10.00E,hamburg.com,1906411
2,Bremen,Germany,53.5N,8.48E,Bremen online,563290
3,Munich,Germany,48.0815,11.3430,stadt.muenchen.de,1487708
4,Stuttgart,Germany,48.4639,09.1048,www.stuttgart-tourist.de,626275
