In [7]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

#### Dados

In [13]:
def get_url_table(url, columns, min_length=4):
    """Download a page and return its first ``wikitable`` as a DataFrame.

    The page at ``url`` is fetched and parsed with BeautifulSoup's
    ``html.parser``. Every ``<tr>`` found in the first
    ``<table class="wikitable">`` becomes one candidate row built from the
    stripped text of its ``<td>`` cells. Rows that end up with a missing
    value in any column are dropped.

    Parameters
    ----------
    url : str
        Address of the page to download.
    columns : list of str
        Column labels for the resulting DataFrame.
    min_length : int, optional
        Minimum number of characters a cell's stripped text must have to be
        kept. The default of 4 reproduces the original ``len(text) > 3``
        filter, which discards short cells -- including any legitimate value
        of three characters or fewer. Empty cells are always discarded.

    Returns
    -------
    pandas.DataFrame
        One row per table row that supplied a value for every column. The
        index keeps the original row positions, so it may contain gaps.

    Raises
    ------
    ValueError
        If the page contains no ``<table class="wikitable">``.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    table = soup.find('table', attrs={'class': 'wikitable'})
    if table is None:
        raise ValueError(
            "no table with class 'wikitable' found at {!r}".format(url)
        )

    # The markup may have no explicit <tbody>; search the table itself then.
    body = table.find('tbody')
    if body is None:
        body = table

    records = []
    for row in body.find_all('tr'):
        texts = (cell.text.strip() for cell in row.find_all('td'))
        records.append(
            [text for text in texts if text and len(text) >= min_length]
        )

    df = pd.DataFrame(records, columns=columns)
    # Rows without <td> cells, and rows that lost a cell to the length
    # filter, come out with missing values; discard them.
    return df.dropna()

In [14]:
def get_body_table(body, columns, min_length=4):
    """Build a DataFrame from the ``<tr>``/``<td>`` cells found under ``body``.

    Each ``<tr>`` returned by ``body.find_all('tr')`` becomes one row made of
    the stripped text of its ``<td>`` cells. No rows are dropped: a row that
    keeps fewer cells than there are columns is padded with missing values.

    Parameters
    ----------
    body : object
        Any object exposing ``find_all('tr')`` whose rows expose
        ``find_all('td')`` and whose cells expose ``.text`` (for example a
        BeautifulSoup ``<tbody>`` or ``<table>`` tag).
    columns : list of str
        Column labels for the resulting DataFrame.
    min_length : int, optional
        Minimum number of characters a cell's stripped text must have to be
        kept. The default of 4 reproduces the original ``len(text) > 3``
        filter, which discards short cells -- including any legitimate value
        of three characters or fewer. Empty cells are always discarded.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>``, in document order.
    """
    records = []
    for row in body.find_all('tr'):
        texts = (cell.text.strip() for cell in row.find_all('td'))
        records.append(
            [text for text in texts if text and len(text) >= min_length]
        )

    return pd.DataFrame(records, columns=columns)

#### População

In [15]:
# Wikipedia (pt) list of the municipalities of Pernambuco by population.
url_population = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_de_Pernambuco_por_popula%C3%A7%C3%A3o'
# Column labels for the population table.
# NOTE(review): despite the "idh" in its name, this variable holds the
# population labels here and is rebound for the IDH table further down.
cols_idh = ['city', 'population']

In [16]:
# Scrape the first wikitable of the population page into a DataFrame.
cities_population = get_url_table(url_population, columns=cols_idh)

In [17]:
# Preview the first rows of the scraped population table.
cities_population.head()

Unnamed: 0,city,population
1,Recife,1 661 017
2,Jaboatão dos Guararapes,711 330
3,Olinda,393 734
4,Caruaru,369 343
5,Petrolina,359 372


In [18]:
# (rows, columns) of the population table.
# NOTE(review): get_url_table discards cells of three characters or fewer and
# then drops incomplete rows, so a municipality with a very short name would
# be missing -- confirm the row count against the source page.
cities_population.shape

(184, 2)

In [19]:
# Persist the population table without the index column.
# Assumes the ./data/cities directory already exists.
cities_population.to_csv('./data/cities/population.csv', index=False)

#### IDH

In [20]:
# Wikipedia (pt) list of the municipalities of Pernambuco by IDH-M.
url_idh = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_de_Pernambuco_por_IDH-M'
# Rebinds cols_idh (previously the population labels) to the IDH labels.
cols_idh = ['city', 'idh']

In [21]:
# Scrape the first wikitable of the IDH-M page into a DataFrame.
cities_idh = get_url_table(url_idh, columns=cols_idh)

In [22]:
# Preview the first rows of the scraped IDH-M table.
# NOTE(review): the saved output starts at index 5, i.e. the rows at positions
# 0-4 were removed by get_url_table's dropna() -- confirm that is intended.
cities_idh.head()

Unnamed: 0,city,idh
5,Fernando de Noronha,788
6,Recife,772
7,Olinda,735
8,Paulista,732
9,Jaboatão dos Guararapes,717


In [35]:
# Persist the IDH-M table (cities_idh, not cities_population) without the
# index column. Assumes the ./data/cities directory already exists.
cities_idh.to_csv('./data/cities/idh.csv', index=False)

#### Microrregião

In [36]:
# Wikipedia (pt) page listing the mesoregions and microregions of Pernambuco.
url = 'https://pt.wikipedia.org/wiki/Lista_de_mesorregi%C3%B5es_e_microrregi%C3%B5es_de_Pernambuco'

In [37]:
# Download the page once, closing the HTTP response as soon as it is read.
with urlopen(url) as response:
    html = response.read()
soup_idh = BeautifulSoup(html, 'html.parser')

# Every <table class="wikitable"> on the page.
tables = soup_idh.find_all('table', attrs={'class':'wikitable'})
# Row container of each table, indexed by position in the cells below.
# A table without an explicit <tbody> is used directly, so no entry is None.
bodys = [table.find('tbody') or table for table in tables]

In [38]:
# NOTE(review): tables_on_page is not referenced by any cell visible in this
# section -- confirm whether it is still needed.
tables_on_page = ['mesoregion']
# Column labels handed to get_body_table: page_cols[0] labels bodys[0] and
# page_cols[1] labels bodys[1] onwards in the cells visible here.
page_cols = [['mesoregion', 'microregion'], ['microregion', 'id']]

In [39]:
# Mesoregions of Pernambuco: first wikitable on the page (bodys[0]).
# NOTE(review): the saved output shows an empty first row and values that
# appear shifted into the 'mesoregion' column -- presumably header-row and
# rowspan effects; confirm before relying on the column alignment.

df_meso_pe = get_body_table(bodys[0], page_cols[0])
df_meso_pe.head()

Unnamed: 0,mesoregion,microregion
0,,
1,Sertão Pernambucano,Araripina
2,Salgueiro,
3,Pajeú,
4,Sertão do Moxotó,


In [40]:
# Sertão Pernambucano mesoregion: second wikitable on the page (bodys[1]).
# NOTE(review): in the saved output the 'id' column holds a place name and
# most rows have only one value -- confirm the intended column meaning.

df_meso0 = get_body_table(bodys[1], page_cols[1])
df_meso0.head()

Unnamed: 0,microregion,id
0,,
1,Araripina,Araripina
2,Bodocó,
3,,
4,Granito,


In [41]:
# São Francisco Pernambucano mesoregion: third wikitable (bodys[2]).
# NOTE(review): in the saved output the 'id' column holds a place name and
# most rows have only one value -- confirm the intended column meaning.

df_meso1 = get_body_table(bodys[2], page_cols[1])
df_meso1.head()

Unnamed: 0,microregion,id
0,,
1,Petrolina,Afrânio
2,Cabrobó,
3,Dormentes,
4,Lagoa Grande,


In [42]:
# Agreste Pernambucano mesoregion: fourth wikitable (bodys[3]).
# NOTE(review): in the saved output the 'id' column holds a place name and
# most rows have only one value -- confirm the intended column meaning.

df_meso2 = get_body_table(bodys[3], page_cols[1])
df_meso2.head()

Unnamed: 0,microregion,id
0,,
1,Vale do Ipanema,Águas Belas
2,Buíque,
3,Itaíba,
4,Pedra,


In [43]:
# Mata Pernambucana mesoregion: fifth wikitable (bodys[4]).
# NOTE(review): in the saved output the 'id' column holds a place name and
# most rows have only one value -- confirm the intended column meaning.

df_meso3 = get_body_table(bodys[4], page_cols[1])
df_meso3.head()

Unnamed: 0,microregion,id
0,,
1,Mata Setentrional Pernambucana,Aliança
2,Buenos Aires,
3,Camutanga,
4,Carpina,


In [44]:
# Metropolitan Recife mesoregion: sixth wikitable (bodys[5]).
# NOTE(review): in the saved output the 'id' column holds a place name and
# most rows have only one value -- confirm the intended column meaning.

df_meso4 = get_body_table(bodys[5], page_cols[1])
df_meso4.head()

Unnamed: 0,microregion,id
0,,
1,Itamaracá,Araçoiaba
2,Igarassu,
3,Ilha de Itamaracá,
4,Itapissuma,
