In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia appendix page that lists cities by GDP.
url = "https://es.wikipedia.org/wiki/Anexo:Ciudades_por_PIB"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()
response.status_code # 200 status code means OK!

200

In [3]:
# Scrape the whole table into one flat list, one column after another
# (2nd through 6th <td> of every row), so the list is ordered column by column.
soup = BeautifulSoup(response.content, "html.parser")
PIB_list = []

for column in range(2, 7):
    selector = f"#mw-content-text > div.mw-parser-output > table > tbody>tr>td:nth-child({column})"
    PIB_list.extend(cell.get_text().strip() for cell in soup.select(selector))

In [4]:
# Creating a dataframe with the scraped table.
# PIB_list stores the five scraped columns back to back (td 2..6 of every row),
# in this order:
scraped_order = ["City", "Country", "Total GDP", "GDP per capita", "Continent"]

# Derive the rows-per-column count from the data instead of assuming exactly 300,
# so a table that gained or lost rows is still split at the right boundaries.
rows_per_column, remainder = divmod(len(PIB_list), len(scraped_order))
if remainder:
    raise ValueError(
        f"Expected {len(scraped_order)} equally long columns, "
        f"got {len(PIB_list)} scraped cells in total"
    )

column_values = {
    name: PIB_list[position * rows_per_column:(position + 1) * rows_per_column]
    for position, name in enumerate(scraped_order)
}

# `columns=` fixes the presentation order: continent right after country.
city_PIB = pd.DataFrame(
    column_values,
    columns=["City", "Country", "Continent", "Total GDP", "GDP per capita"],
)

In [5]:
# Cleaning numeric data
# The scraped figures contain non-breaking spaces (U+00A0) inside the numbers,
# which pd.to_numeric cannot parse. Remove them as a plain character:
# "\xa0" is the character itself and regex=False makes the match literal, so the
# result does not depend on the pandas version's default for `regex`.
for column in ("Total GDP", "GDP per capita"):
    city_PIB[column] = pd.to_numeric(
        city_PIB[column].str.replace("\xa0", "", regex=False)
    )

In [11]:
# Preview the cleaned city table; the bare last expression is rendered as the cell output.
city_PIB

Unnamed: 0,City,Country,Continent,Total GDP,GDP per capita
0,Tokio,Japón,Asia,1617000,68776
1,Nueva York,Estados Unidos,América,1403463,69915
2,Los Ángeles,Estados Unidos,América,860452,65082
3,Seúl,Corea del Sur,Asia,845906,34355
4,Londres,Reino Unido,Europa,835658,57157
...,...,...,...,...,...
295,El Paso,Estados Unidos,América,32659,38737
296,Edimburgo,Reino Unido,Europa,32497,58437
297,Winnipeg,Canadá,América,32478,41719
298,Alejandría,Egipto,África,32400,7100


In [6]:
# Download the datosmacro page with GDP figures per country.
url = "https://datosmacro.expansion.com/pib"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()
response.status_code # 200 status code means OK!


200

In [7]:
# Scraping the country PIB website: for every row of the #tbPC table keep the
# text of its first link (country name) and of its fifth cell (GDP figure).
soup = BeautifulSoup(response.content, "html.parser")
countries, PIB_countries = [], []

for row in soup.select("#tbPC > tbody > tr"):
    name_link = row.select_one("a")
    gdp_cell = row.select_one("td:nth-child(5)")
    countries.append(name_link.get_text().strip())
    PIB_countries.append(gdp_cell.get_text().strip())

In [8]:
# Build the country dataframe from the two scraped lists (one entry per table row)
country_columns = {}
country_columns["Country"] = countries
country_columns["Country GDP per capita"] = PIB_countries
country_PIB = pd.DataFrame(country_columns)

In [9]:
# Cleaning data
# Strip any run of trailing non-word characters from the country names.
# The pattern is a raw string (no invalid "\W"/"\Z" escapes) and regex=True is
# explicit, because pandas >= 2.0 treats the pattern literally by default.
country_PIB["Country"] = country_PIB["Country"].str.replace(r"\W*\Z", "", regex=True)

# Remove the "." and "$" characters from the GDP strings so they can be parsed
# as numbers later. Both are regex metacharacters, hence regex=False.
gdp_text = country_PIB["Country GDP per capita"]
for symbol in (".", "$"):
    gdp_text = gdp_text.str.replace(symbol, "", regex=False)
country_PIB["Country GDP per capita"] = gdp_text

In [10]:
# Parse the cleaned GDP strings into numbers
gdp_column = "Country GDP per capita"
country_PIB[gdp_column] = pd.to_numeric(country_PIB[gdp_column])

In [225]:
# Remove rows with missing values first, then exact duplicate rows
country_PIB = country_PIB.dropna().drop_duplicates()

In [226]:
# Cast the per-capita GDP column to integers
gdp_column = "Country GDP per capita"
country_PIB[gdp_column] = country_PIB[gdp_column].astype(int)

In [227]:
# Write both dataframes to CSV files in the data folder (index column included)
exports = {
    "../data/cities_PIB.csv": city_PIB,
    "../data/countries_PIB.csv": country_PIB,
}
for destination, frame in exports.items():
    frame.to_csv(destination)