### Web scraping each city's Wikipedia page
Now I will scrape each city's Wikipedia page to obtain the following three columns:
- population
- average elevation (in m)
- country

#### Import packages

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

#### Read cities_df

In [2]:
# Load the cleaned city table from disk.
cities_df_cleaned = pd.read_csv("data/cities_df_cleaned.csv")

# Distinct city names found in the "City" column.
cities = pd.unique(cities_df_cleaned["City"])
cities

array(['Berlin', 'Hamburg', 'Munich', 'Cologne', 'Paris', 'Nice', 'Rome',
       'Milan', 'Warsaw', 'Barcelona', 'Madrid', 'Sevilla', 'London',
       'Birmingham', 'Manchester', 'Leeds', 'Newcastle', 'The Hague',
       'Amsterdam', 'Rotterdam'], dtype=object)

#### Webscrape the wiki pages for selected cities

In [3]:
# Scratch containers; nothing in this cell fills them.
urls = list()
responses = list()
soups = list()

# Page names appended to the wiki URL, one entry per city to scrape.
# NOTE(review): "Leeds" and "City_of_Newcastle" fail to scrape (see the
# errors printed below this cell) -- verify the intended page names.
cities_webscraping = [
    "Berlin", "Hamburg", "Munich", "Cologne",
    "Paris", "Nice",
    "Rome", "Milan",
    "Warsaw",
    "Barcelona", "Madrid", "Seville",
    "London", "Birmingham", "Manchester", "Leeds", "City_of_Newcastle",
    "The_Hague", "Amsterdam", "Rotterdam",
]

# Matches any infobox string containing "elevation", case-insensitively.
# Compiled once instead of on every loop iteration.
_ELEVATION_LABEL = re.compile(r"elevation", re.IGNORECASE)

# Column order of the returned frame; also used when no city could be scraped.
_SCRAPED_COLUMNS = ["City", "Country", "Elevation (in m)", "Population"]


def _infobox_value(table, label):
    """Return the text of the first <td> following the infobox string that
    matches ``label`` (an exact string or a compiled regex), or None when
    either the label or a following <td> is missing."""
    label_node = table.find(string=label)
    if label_node is None:
        return None
    value_cell = label_node.find_next("td")
    return value_cell.get_text() if value_cell is not None else None


def _parse_elevation(raw):
    """Keep the text before the first "m"; if that part contains "(", keep
    what follows the first "(" instead. Returns None for a missing value."""
    if raw is None:
        return None
    before_unit = raw.split("m")[0]
    if "(" in before_unit:
        before_unit = before_unit.split("(")[1]
    return before_unit


def _parse_population(raw):
    """Drop a trailing footnote marker such as "[1]". Returns None for a
    missing value. Thousands separators are left in place."""
    if raw is None:
        return None
    return raw.split("[")[0]


def cities_dataframe(cities, timeout=10):
    """Scrape country, elevation and population from each city's wiki infobox.

    Parameters
    ----------
    cities : iterable of str
        Page names appended to ``https://www.wikipedia.org/wiki/``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped city with the columns ``City``,
        ``Country``, ``Elevation (in m)`` and ``Population``. Scraped values
        are raw strings (not yet numeric); a field that could not be found is
        ``None`` rather than the string ``"None"``. The columns are present
        even when no city could be scraped.

    Notes
    -----
    A city is skipped, with a message printed, when the request fails or the
    page has no ``infobox ib-settlement vcard`` table. One failing city never
    aborts the remaining ones.
    """
    cities_data = []
    for city in cities:
        url = f"https://www.wikipedia.org/wiki/{city}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Network problems and HTTP errors are reported and skipped.
            print(f"Error processing city: {city} ({exc})")
            continue

        city_soup = BeautifulSoup(response.content, "html.parser")
        infoboxes = city_soup.find_all("table", class_="infobox ib-settlement vcard")
        if not infoboxes:
            print(f"Error processing city: {city}")
            continue
        table = infoboxes[0]

        # GDP and latitude/longitude extraction existed only as commented-out
        # experiments and are intentionally not part of the scraped columns.
        cities_data.append({
            "City": city,
            "Country": _infobox_value(table, "Country"),
            "Elevation (in m)": _parse_elevation(_infobox_value(table, _ELEVATION_LABEL)),
            "Population": _parse_population(_infobox_value(table, "Population")),
        })
    return pd.DataFrame(cities_data, columns=_SCRAPED_COLUMNS)

# Scrape every listed page, then order the rows alphabetically by city name.
df_webscraping = (
    cities_dataframe(cities_webscraping)
    .sort_values("City")
)
df_webscraping.head()

Error processing city: Leeds
Error processing city: City_of_Newcastle


Unnamed: 0,City,Country,Elevation (in m),Population
16,Amsterdam,Netherlands,−2,921402
9,Barcelona,Spain,12,1620343
0,Berlin,Germany,34,3576873
13,Birmingham,England,140,1142494
3,Cologne,Germany,37,1073096


#### Add missing rows
Because the code above fails to scrape the pages of two English cities (Leeds and Newcastle), I will manually add their information to the dataframe.

In [4]:
# Values entered by hand for the two cities whose scrape failed.
Leeds_row = {'City': 'Leeds', 'Elevation (in m)': "10", "Population": "536280", "Country": "England"}
Newcastle_row = {'City': 'Newcastle', 'Elevation (in m)': "30", "Population": "168873", "Country": "England"}

manual_rows = pd.DataFrame([Leeds_row, Newcastle_row])

# Only add cities that are not in the frame yet, so re-running this cell
# does not append duplicate rows (duplicates would multiply rows in the
# later merge on "City").
manual_rows = manual_rows[~manual_rows["City"].isin(df_webscraping["City"])]

if not manual_rows.empty:
    # Label the new rows after the largest existing index label so that no
    # existing row can be overwritten or share a label with a new one.
    first_new_label = df_webscraping.index.max() + 1
    manual_rows = manual_rows.set_axis(
        range(first_new_label, first_new_label + len(manual_rows)), axis=0
    )
    df_webscraping = pd.concat([df_webscraping, manual_rows])

#### Clean data
##### Clean the population column
The population values for Rome and Warsaw came back as NaN, so I will fill them in manually.

In [5]:
# Populations entered by hand for the cities whose scraped value was unusable.
manual_population = {"Rome": "2,860,009", "Warsaw": "1,863,056"}

# Work on a text copy of the column. Casting to str first keeps this cell
# re-runnable: after a previous run the column is already numeric, and both
# the `.str` accessor and assigning strings into it would otherwise fail.
population_text = df_webscraping["Population"].astype(str)
population_override = df_webscraping["City"].map(manual_population)
population_text = population_text.where(population_override.isna(), population_override)

# Remove thousands separators and convert to numbers; anything that still
# is not a number becomes NaN.
df_webscraping["Population"] = pd.to_numeric(
    population_text.str.replace(",", "", regex=False), errors="coerce"
)
df_webscraping

Unnamed: 0,City,Country,Elevation (in m),Population
16,Amsterdam,Netherlands,−2,921402
9,Barcelona,Spain,12,1620343
0,Berlin,Germany,34,3576873
13,Birmingham,England,140,1142494
3,Cologne,Germany,37,1073096
1,Hamburg,Germany,,1945532
12,London,England,11,8799800
10,Madrid,Spain,650,3223334
14,Manchester,England,38,549853
7,Milan,Italy,120,1371498


##### Clean the City and Country columns

In [6]:
# Enter Rome's country by hand.
df_webscraping.loc[df_webscraping["City"].eq("Rome"), "Country"] = "Italy"

# Replace the page names used for scraping with the city names wanted in the table.
df_webscraping["City"] = df_webscraping["City"].replace(
    to_replace=["The_Hague", "Seville"],
    value=["The Hague", "Sevilla"],
)

##### Clean the Elevation column

In [7]:
# Elevations (in metres) entered by hand; these replace whatever was scraped.
manual_elevation = {
    "Hamburg": "23",
    "Nice": "10",
    "Paris": "35",
    "Warsaw": "100",
    "Rotterdam": "2",
}

# Work on a text copy of the column. Casting to str first keeps this cell
# re-runnable: after a previous run the column is float, and `.str.replace`
# on a column mixing floats and strings would turn every float into NaN.
elevation_text = df_webscraping["Elevation (in m)"].astype(str)
elevation_override = df_webscraping["City"].map(manual_elevation)
elevation_text = elevation_text.where(elevation_override.isna(), elevation_override)

# Strip non-breaking spaces and swap the Unicode minus sign for an ASCII
# hyphen so that negative values parse as numbers.
elevation_text = (
    elevation_text
    .str.replace("\xa0", "", regex=False)
    .str.replace("−", "-", regex=False)
)

# Anything that still is not a number becomes NaN.
df_webscraping["Elevation (in m)"] = pd.to_numeric(
    elevation_text, downcast="float", errors="coerce"
)
df_webscraping

Unnamed: 0,City,Country,Elevation (in m),Population
16,Amsterdam,Netherlands,-2.0,921402
9,Barcelona,Spain,12.0,1620343
0,Berlin,Germany,34.0,3576873
13,Birmingham,England,140.0,1142494
3,Cologne,Germany,37.0,1073096
1,Hamburg,Germany,23.0,1945532
12,London,England,11.0,8799800
10,Madrid,Spain,650.0,3223334
14,Manchester,England,38.0,549853
7,Milan,Italy,120.0,1371498


#### Merge with cities_df

In [8]:
# Merge from a copy of the scraped frame.
df_webscraping_cleaned = df_webscraping.copy()

# Left join: every row of cities_df_cleaned is kept, and the scraped columns
# are attached where "City" matches.
df_cities_ws_cleaned = pd.merge(
    cities_df_cleaned,
    df_webscraping_cleaned,
    on="City",
    how="left",
)
df_cities_ws_cleaned

Unnamed: 0,City,country_2c,latitude,longitude,is_capital,Country,Elevation (in m),Population
0,Berlin,DE,52.5167,13.3833,True,Germany,34.0,3576873
1,Hamburg,DE,53.55,10.0,False,Germany,23.0,1945532
2,Munich,DE,48.1372,11.5755,False,Germany,520.0,1512491
3,Cologne,DE,50.9422,6.9578,False,Germany,37.0,1073096
4,Paris,FR,48.8566,2.3522,True,France,35.0,2102650
5,Nice,FR,43.7034,7.2663,False,France,10.0,348085
6,Rome,IT,41.8931,12.4828,True,Italy,21.0,2860009
7,Milan,IT,45.4669,9.19,False,Italy,120.0,1371498
8,Warsaw,PL,52.2167,21.0333,True,Poland,100.0,1863056
9,Barcelona,ES,41.3825,2.1769,False,Spain,12.0,1620343


#### Save the dataframe for later use

In [9]:
# Write the merged table as comma-separated UTF-8 text, without the index.
df_cities_ws_cleaned.to_csv(
    path_or_buf="data/df_cities_ws_cleaned.csv",
    sep=",",
    encoding="utf-8",
    index=False,
)