In [316]:
import pandas as pd
import requests
import io
import os


In [317]:
def download_worldbank(indicator, countries, date_start, date_end):
    url_base = 'http://api.worldbank.org/v2/'  # Base URL for the World Bank API
    country_codes = ';'.join(countries)  # Combine country codes into a string
    url = url_base + f'country/{country_codes}/indicator/{indicator}?date={date_start}:{date_end}&per_page=30000' #create the url with start and end date.
    url = url_base + f'country/{country_codes}/indicator/{indicator}?per_page=30000' # This line overrides the previous one. It will ignore start/end date.

    response = requests.get(url)  # Download data from the URL
    df = pd.read_xml(response.content)  # Convert the downloaded data to a table
    return df  # Return the table

In [318]:
#'SL.AGR.EMPL.ZS','SP.RUR.TOTL.ZG','SI.POV.NAHC', 'SI.POV.GINI','IT.NET.SECR.P6','IT.CEL.SETS.P2','FP.CPI.TOTL','FX.OWN.TOTL.ZS','EG.ELC.ACCS.ZS','EG.ELC.ACCS.RU.ZS','BX.TRF.PWKR.DT.GD.ZS'
ag_emp = download_worldbank(
    indicator = 'SL.AGR.EMPL.ZS',
    countries = ['BGD', 'IND','VNM','LAO','IND','THA','KHM','MYS','SGP','MMR','PHL','BRN'],
    date_start = '2020',
    date_end = '2023'
)

In [319]:
ag_emp.head(8)

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,Employment in agriculture (% of total employme...,Bangladesh,BGD,2024,,,,0
1,Employment in agriculture (% of total employme...,Bangladesh,BGD,2023,35.274072,,,0
2,Employment in agriculture (% of total employme...,Bangladesh,BGD,2022,35.662164,,,0
3,Employment in agriculture (% of total employme...,Bangladesh,BGD,2021,37.306117,,,0
4,Employment in agriculture (% of total employme...,Bangladesh,BGD,2020,37.971569,,,0
5,Employment in agriculture (% of total employme...,Bangladesh,BGD,2019,38.464793,,,0
6,Employment in agriculture (% of total employme...,Bangladesh,BGD,2018,39.552129,,,0
7,Employment in agriculture (% of total employme...,Bangladesh,BGD,2017,40.597307,,,0


In [320]:
def clean_wb(data):
    data = data.drop(["country","unit","decimal","obs_status","indicator"], axis=1)
    data = data.rename({"countryiso3code":'country',"date":'year'}, axis=1)
    return data


ag_emp = clean_wb(ag_emp)

ag_emp = ag_emp.rename({'value':"Employment in Agr. (percent of total employment)"},axis=1)

ag_emp.head()

Unnamed: 0,country,year,Employment in Agr. (percent of total employment)
0,BGD,2024,
1,BGD,2023,35.274072
2,BGD,2022,35.662164
3,BGD,2021,37.306117
4,BGD,2020,37.971569


In [321]:
#'SL.AGR.EMPL.ZS','SP.RUR.TOTL.ZG','SI.POV.NAHC', 'SI.POV.GINI','IT.NET.SECR.P6','IT.CEL.SETS.P2','FP.CPI.TOTL','FX.OWN.TOTL.ZS','EG.ELC.ACCS.ZS','EG.ELC.ACCS.RU.ZS','BX.TRF.PWKR.DT.GD.ZS'
rural_growth = download_worldbank(
    indicator = 'SP.RUR.TOTL.ZG',
    countries = ['BGD', 'IND','VNM','LAO','IND','THA','KHM','MYS','SGP','MMR','PHL','BRN'],
    date_start = '2021',
    date_end = '2023'
)

rural_growth = clean_wb(rural_growth)
rural_growth = rural_growth.rename({'value':"Rural Pop. Growth (annual percent)"},axis=1)

rural_growth.head()

Unnamed: 0,country,year,Rural Pop. Growth (annual percent)
0,BGD,2024,
1,BGD,2023,-0.050253
2,BGD,2022,-0.236671
3,BGD,2021,-0.436696
4,BGD,2020,-0.404684


In [322]:
rural_growth.year.dtype

dtype('int64')

In [323]:
def change_date(data):
    data.year = pd.PeriodIndex(data.year, freq = 'Q').to_timestamp()
    return data

In [324]:
rural_growth = change_date(rural_growth)
ag_emp = change_date(ag_emp)

In [325]:
rural_growth.year.dtype

rural_growth.head()

Unnamed: 0,country,year,Rural Pop. Growth (annual percent)
0,BGD,2024-01-01,
1,BGD,2023-01-01,-0.050253
2,BGD,2022-01-01,-0.236671
3,BGD,2021-01-01,-0.436696
4,BGD,2020-01-01,-0.404684


In [326]:
ag_emp.head()

Unnamed: 0,country,year,Employment in Agr. (percent of total employment)
0,BGD,2024-01-01,
1,BGD,2023-01-01,35.274072
2,BGD,2022-01-01,35.662164
3,BGD,2021-01-01,37.306117
4,BGD,2020-01-01,37.971569


In [327]:
ag_emp = ag_emp.set_index(['country','year'])
ag_emp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Employment in Agr. (percent of total employment)
country,year,Unnamed: 2_level_1
BGD,2024-01-01,
BGD,2023-01-01,35.274072
BGD,2022-01-01,35.662164
BGD,2021-01-01,37.306117
BGD,2020-01-01,37.971569


In [328]:
rural_growth = rural_growth.set_index(['country','year'])
rural_growth.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rural Pop. Growth (annual percent)
country,year,Unnamed: 2_level_1
BGD,2024-01-01,
BGD,2023-01-01,-0.050253
BGD,2022-01-01,-0.236671
BGD,2021-01-01,-0.436696
BGD,2020-01-01,-0.404684


In [329]:
wb_merged = pd.merge(
    rural_growth,
    ag_emp,
    right_index= True,
    left_index = True,
    how = 'inner'
)

wb_merged.head()

wb_merged.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Rural Pop. Growth (annual percent),Employment in Agr. (percent of total employment)
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
VNM,1964-01-01,2.250111,
VNM,1963-01-01,2.35434,
VNM,1962-01-01,2.199883,
VNM,1961-01-01,2.271694,
VNM,1960-01-01,,


In [330]:
#os.makedirs("data/processed/", exist_ok=True)
wb_merged.to_csv("../data/processed/wb_merged.csv")