In [1]:
# Your code here
from pathlib import Path

import numpy as np
import pandas as pd
import requests  # this module helps us to download a webpage
from bs4 import BeautifulSoup  # this module helps in web scraping.

In [2]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup: The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the timeout expires.
    """
    # Define your User-Agent (important for Wikipedia to identify you)
    USER_AGENT = "MyEducationalBot/1.0 (contact: syn@aungthuya.dev)"
    headers = {'User-Agent': USER_AGENT}

    # Without an explicit timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail loudly on error responses instead of parsing an error page as if it were data.
    response.raise_for_status()

    # Create a Soup object from the downloaded HTML
    return BeautifulSoup(response.text, "html.parser")

In [3]:
def get_table(soup, index):
    """Return the ``index``-th ``<table>`` element of a parsed page.

    The number of tables found is printed so the right index can be picked
    interactively.

    Args:
        soup: Parsed page exposing ``find_all``.
        index: Position of the wanted table in document order.

    Returns:
        The table element at position ``index``.

    Raises:
        IndexError: If the page has no table at position ``index``.
    """
    all_tables = soup.find_all('table')
    print(f"{len(all_tables)} tables found on the page.")

    try:
        return all_tables[index]
    except IndexError:
        # Previously this path fell through to `return table` with `table` unbound.
        raise IndexError(
            f"Table index {index} is out of range: "
            f"only {len(all_tables)} tables found on the page."
        ) from None

In [4]:
def clean_text(cell):
    """Return the text of a table cell with commas removed and whitespace normalised.

    Commas are dropped so that figures such as ``11,451,245`` become ``11451245``.
    Every run of whitespace (including line breaks inside the cell) is collapsed
    to a single space, and leading/trailing whitespace is removed.

    Args:
        cell: Object exposing the cell content through a ``text`` attribute.

    Returns:
        str: The cleaned cell text.
    """
    text = cell.text.replace(',', '')
    # A line break separates words; deleting it outright would glue them
    # together ("São\nPaulo" -> "SãoPaulo"), so treat it like any other whitespace.
    return ' '.join(text.split())

In [5]:
def get_data(table, cols, min_cells=4):
    """Extract the selected cells of every data row of a table.

    Rows are skipped when they contain fewer than ``min_cells`` ``<td>`` cells
    or when they are too short to hold every index in ``cols``.

    Args:
        table: Table element exposing ``find_all``.
        cols: Positions of the ``<td>`` cells to keep from each row, in output order.
        min_cells: Minimum number of ``<td>`` cells a row must have to be kept.

    Returns:
        list[list[str]]: One list of cleaned cell texts per kept row.
    """
    cols = list(cols)

    # Shortest row length for which every requested index is valid
    # (negative indices count from the end of the row).
    needed = max((i + 1 if i >= 0 else -i for i in cols), default=0)
    required = max(min_cells, needed)

    data = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < required:
            continue
        data.append([clean_text(cells[i]) for i in cols])
    return data
    

### Get LookUp Table
The Wikipedia tables don't have a state-acronym column.
The Olist datasets record states by their acronyms, so a name-to-acronym lookup table is needed.

In [6]:
# Download and parse the page used as the source of the state lookup table.
soup = get_soup("https://brazil-help.com/brazilian_states.htm")

In [7]:
# Table 27 (0-based) is the one used for the lookup; the index is tied to the
# page's current layout. NOTE(review): re-check it if the site changes.
table = get_table(soup, 27)
# Keep only the first two cells of each row (they become "state" and "name" below).
data = get_data(table, [0, 1])

34 tables found on the page.


In [8]:
# "state" holds the acronym (e.g. AC) and "name" the full state name (e.g. Acre).
lookup = pd.DataFrame(data, columns = ["state", "name"])

In [9]:
# Discard the first two scraped rows (labels 0 and 1) — presumably header rows, TODO confirm.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second run no longer raises KeyError for missing labels.
lookup = lookup.drop(index=[0, 1], errors="ignore")

In [10]:
# Several scraped names come out with words glued together; map each exact
# glued spelling to the properly spaced state name in a single replace call.
lookup['name'] = lookup['name'].replace({
    'SãoPaulo': 'São Paulo',
    'MinasGerais': 'Minas Gerais',
    'Riode Janeiro': 'Rio de Janeiro',
    'RioGrande do Sul': 'Rio Grande do Sul',
    'SantaCatarina': 'Santa Catarina',
    'EspíritoSanto': 'Espírito Santo',
    'MatoGrosso': 'Mato Grosso',
    'RioGrande do Norte': 'Rio Grande do Norte',
    'MatoGrossodo Sul': 'Mato Grosso do Sul',
    'DistritoFederal': 'Distrito Federal',
})

### Get Population by Municipalities 
This dataset will be used for the detailed analysis.

Remark: `Not all cities are listed` — only the most populous ones.

In [11]:
# Download and parse the Wikipedia list of Brazilian municipalities by population.
soup = get_soup("https://en.wikipedia.org/wiki/List_of_municipalities_in_Brazil_by_population")

In [12]:
# Get population data: table 1 (0-based) of the page is used.
# NOTE(review): the index depends on the article's current layout.
table = get_table(soup, 1)
# Cells 0, 1 and 3 of each row become "city", "state" and "population" below.
data = get_data(table, [0, 1, 3])

9 tables found on the page.


In [13]:
# Build the city-level frame; every value, including population, is still a
# string at this point because clean_text returns text.
df = pd.DataFrame(data, columns=["city", "state", "population"])
df.head()

Unnamed: 0,city,state,population
0,São Paulo,São Paulo,11451245
1,Rio de Janeiro,Rio de Janeiro,6211423
2,Brasília,Distrito Federal,2817068
3,Fortaleza,Ceará,2428678
4,Salvador,Bahia,2418005


In [14]:
# Attach the acronym by matching df's full state name to lookup's "name".
# An outer join keeps unmatched rows from either side, so mismatches show up as
# nulls in the check below. Both frames have a "state" column, so pandas
# suffixes them: state_x (from df) and state_y (from lookup).
merge = pd.merge(df, lookup, left_on = "state", right_on = "name", how = "outer")

In [15]:
# Count nulls per column; all zeros means every row found a match in the outer join.
merge.isnull().sum()

city          0
state_x       0
population    0
state_y       0
name          0
dtype: int64

In [16]:
# Preview the merged rows to confirm names and acronyms line up.
merge.head()

Unnamed: 0,city,state_x,population,state_y,name
0,Rio Branco,Acre,364756,AC,Acre
1,Maceió,Alagoas,957916,AL,Alagoas
2,Arapiraca,Alagoas,234696,AL,Alagoas
3,Macapá,Amapá,442933,AP,Amapá
4,Santana,Amapá,107373,AP,Amapá


In [17]:
# state_x duplicates the state name already held in "name", so drop it.
# Rebinding (as the state-level section does) instead of inplace=True, together
# with errors="ignore", keeps the cell safe to re-run once the column is gone.
merge = merge.drop(columns="state_x", errors="ignore")

In [18]:
# Positional rename: relies on the remaining column order being
# city, population, state_y, name (i.e. state_x has already been dropped).
merge.columns = ["city", "population", "state_id", "state_name"]
merge

Unnamed: 0,city,population,state_id,state_name
0,Rio Branco,364756,AC,Acre
1,Maceió,957916,AL,Alagoas
2,Arapiraca,234696,AL,Alagoas
3,Macapá,442933,AP,Amapá
4,Santana,107373,AP,Amapá
...,...,...,...,...
333,Leme,98161,SP,São Paulo
334,Votuporanga,96634,SP,São Paulo
335,Caçapava,96202,SP,São Paulo
336,Palmas,302692,TO,Tocantins


In [19]:
# Create the output folder first so the export also works on a fresh checkout,
# where to_csv would otherwise fail because the directory is missing.
output_path = Path('DataSet2/population_cities_2022.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
merge.to_csv(output_path, index=False)

### Get Population by States 
This dataset will be used for the overall analysis.

Remark: this is the actual total population of each state.

In [20]:
# Download and parse the Wikipedia article on the 2022 Brazilian census.
soup = get_soup("https://en.wikipedia.org/wiki/2022_Brazilian_census")

In [21]:
# Get population data: table 2 (0-based) of the page is used.
# NOTE(review): the index depends on the article's current layout.
table = get_table(soup, 2)
# Cells 1 and 3 of each row become "state" and "population" below.
data = get_data(table, [1, 3])

5 tables found on the page.


In [22]:
# Build the state-level frame; note this rebinds `df`, replacing the
# city-level frame created earlier. Values are still strings here.
df = pd.DataFrame(data, columns=["state", "population"])
df.head()

Unnamed: 0,state,population
0,São Paulo,44411238
1,Minas Gerais,20539989
2,Rio de Janeiro,16055174
3,Bahia,14141626
4,Paraná,11444380


In [23]:
# Same join as in the city section: match the full state name to lookup's
# "name". The outer join exposes unmatched rows as nulls, and the clashing
# "state" columns are suffixed state_x (from df) and state_y (from lookup).
merge = pd.merge(df, lookup, left_on = "state", right_on = "name", how = "outer")

In [24]:
# Preview the merged rows to confirm names and acronyms line up.
merge.head()

Unnamed: 0,state_x,population,state_y,name
0,Acre,830018,AC,Acre
1,Alagoas,3127683,AL,Alagoas
2,Amapá,733759,AP,Amapá
3,Amazonas,3941613,AM,Amazonas
4,Bahia,14141626,BA,Bahia


In [25]:
# Count nulls per column; all zeros means every state matched a lookup entry.
merge.isnull().sum()

state_x       0
population    0
state_y       0
name          0
dtype: int64

In [26]:
# The state name is present twice after the merge (state_x and name);
# keep the lookup's "name" column and discard state_x.
merge = merge.drop(columns=["state_x"])
merge.head()

Unnamed: 0,population,state_y,name
0,830018,AC,Acre
1,3127683,AL,Alagoas
2,733759,AP,Amapá
3,3941613,AM,Amazonas
4,14141626,BA,Bahia


In [27]:
# Positional rename: relies on the remaining column order being
# population, state_y, name (i.e. state_x has already been dropped).
merge.columns = ["population", "state_id", "state_name"]
merge.head()

Unnamed: 0,population,state_id,state_name
0,830018,AC,Acre
1,3127683,AL,Alagoas
2,733759,AP,Amapá
3,3941613,AM,Amazonas
4,14141626,BA,Bahia


In [28]:
# Create the output folder first so the export also works on a fresh checkout,
# where to_csv would otherwise fail because the directory is missing.
output_path = Path('DataSet2/population_states_2022.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
merge.to_csv(output_path, index=False)