# Data retrieval from the World Bank API

https://databank.worldbank.org/source/world-development-indicators

### imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from io import BytesIO
from zipfile import ZipFile
import requests
import xml.etree.ElementTree as et

### Download xml

for example 
- GDP per capita (current US$): 'NY.GDP.PCAP.CD'
- International tourism, number of arrivals: 'ST.INT.ARVL'
- Unemployment, total (% of total labor force) (modeled ILO estimate): 'SL.UEM.TOTL.ZS'

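The cell below defines a function that downloads one indicator from the API, then uses it to fetch several indicators and save the combined data to a local CSV file. Only data for years after 2009 is kept.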

Data is saved in CSV format, where the columns are 'Country or Area', 'Item', 'Year', 'Value' and 'Code3' (the country key taken from the XML).

In [52]:
def get_WB_indicator(indicator, min_year=2010):
    """Download one World Bank indicator and return it as a long-format DataFrame.

    The indicator is fetched as a zipped XML file; the first member of the
    archive is parsed and every ``record`` element whose year is at least
    ``min_year`` becomes one row.

    Parameters
    ----------
    indicator : str
        World Bank indicator code, e.g. ``'NY.GDP.PCAP.CD'``.
    min_year : int, default 2010
        Earliest year to keep (inclusive). The default keeps years > 2009.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country or Area', 'Item', 'Year', 'Value', 'Code3' with a
        0-based index. 'Value' is numeric (unparseable or missing values
        become NaN); the other columns hold the raw XML text. 'Code3' is the
        ``key`` attribute of the 'Country or Area' field.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url_base = 'https://api.worldbank.org/v2/en/indicator/'
    columns = ['Country or Area', 'Item', 'Year', 'Value', 'Code3']

    # Fail early on HTTP errors instead of handing an error page to ZipFile,
    # and do not hang forever on a stalled connection.
    response = requests.get(url_base + indicator + '?downloadformat=xml', timeout=60)
    response.raise_for_status()

    # Context manager guarantees the archive is closed.
    with ZipFile(BytesIO(response.content), 'r') as zf:
        data = zf.read(zf.namelist()[0])
    dataroot = et.fromstring(data)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for m in dataroot[0]:
        if m.tag != 'record':
            continue
        d = {}
        for x in m:
            d[x.attrib['name']] = x.text
            if x.attrib['name'] == 'Country or Area':
                d['Code3'] = x.attrib['key']
        if int(d['Year']) >= min_year:
            records.append(d)

    # Passing `columns` fixes the column order, drops any other fields and
    # keeps the 'Value' column present even when no record matched.
    df = pd.DataFrame(records, columns=columns)
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
    return df

# Fetch the three indicators, each rounded to a precision that suits it, and
# stack them into one long table. DataFrame.append was removed in pandas 2.0;
# pd.concat is its replacement and, like append without ignore_index, keeps
# each frame's own 0-based index.
df = pd.concat([
    get_WB_indicator('ST.INT.ARVL').round({'Value': 0}),
    get_WB_indicator('SL.UEM.TOTL.ZS').round({'Value': 1}),
    get_WB_indicator('NY.GDP.PCAP.CD').round({'Value': 0}),
])
# Shorten the long unemployment label.
df.loc[df.Item=='Unemployment, total (% of total labor force) (modeled ILO estimate)','Item']='Unemployment (% of total labor force)'
df.to_csv('WB_statistics.csv', index=False)

In [53]:
# Sanity check: (rows, columns) of the combined indicator table.
df.shape

(8778, 5)

In [54]:
# Preview the first rows of the combined indicator table.
df.head()

Unnamed: 0,Country or Area,Item,Year,Value,Code3
0,Aruba,"International tourism, number of arrivals",2010,1394000.0,ABW
1,Aruba,"International tourism, number of arrivals",2011,1469000.0,ABW
2,Aruba,"International tourism, number of arrivals",2012,1481000.0,ABW
3,Aruba,"International tourism, number of arrivals",2013,1667000.0,ABW
4,Aruba,"International tourism, number of arrivals",2014,1739000.0,ABW


In [55]:
# Spot check: all rows for a single country.
df.loc[df['Country or Area'] == 'Finland']

Unnamed: 0,Country or Area,Item,Year,Value,Code3
825,Finland,"International tourism, number of arrivals",2010,3670000.0,FIN
826,Finland,"International tourism, number of arrivals",2011,4192000.0,FIN
827,Finland,"International tourism, number of arrivals",2012,4226000.0,FIN
828,Finland,"International tourism, number of arrivals",2013,2797000.0,FIN
829,Finland,"International tourism, number of arrivals",2014,2731000.0,FIN
830,Finland,"International tourism, number of arrivals",2015,2622000.0,FIN
831,Finland,"International tourism, number of arrivals",2016,2789000.0,FIN
832,Finland,"International tourism, number of arrivals",2017,3180000.0,FIN
833,Finland,"International tourism, number of arrivals",2018,3224000.0,FIN
834,Finland,"International tourism, number of arrivals",2019,3290000.0,FIN


In [56]:
# Reshape the table: drop the country name, rename the columns, and swap each
# indicator name for a numeric feature id.
column_names = {'Item':'feature_id','Year':'year','Value':'value', 'Code3':'country_code'}
feature_ids = {
    'Unemployment (% of total labor force)': 3,
    'International tourism, number of arrivals': 2,
    'GDP per capita (current US$)': 1,
}
df2 = df.drop(columns=['Country or Area']).rename(columns=column_names)
for item_name, feature_id in feature_ids.items():
    df2.loc[df2['feature_id'] == item_name, 'feature_id'] = feature_id
# The source data is yearly, so the month column is left empty.
df2['month'] = np.nan

In [57]:
# Preview the first ten rows of the reshaped table.
df2.head(10)

Unnamed: 0,feature_id,year,value,country_code,month
0,2,2010,1394000.0,ABW,
1,2,2011,1469000.0,ABW,
2,2,2012,1481000.0,ABW,
3,2,2013,1667000.0,ABW,
4,2,2014,1739000.0,ABW,
5,2,2015,1832000.0,ABW,
6,2,2016,1758000.0,ABW,
7,2,2017,1863000.0,ABW,
8,2,2018,1897000.0,ABW,
9,2,2019,1951000.0,ABW,


In [58]:
# Spot check: all reshaped rows for Finland, selected by country code.
df2.loc[df2['country_code'] == 'FIN']

Unnamed: 0,feature_id,year,value,country_code,month
825,2,2010,3670000.0,FIN,
826,2,2011,4192000.0,FIN,
827,2,2012,4226000.0,FIN,
828,2,2013,2797000.0,FIN,
829,2,2014,2731000.0,FIN,
830,2,2015,2622000.0,FIN,
831,2,2016,2789000.0,FIN,
832,2,2017,3180000.0,FIN,
833,2,2018,3224000.0,FIN,
834,2,2019,3290000.0,FIN,


In [59]:
# Save the reshaped table. NOTE: this overwrites the 'WB_statistics.csv'
# written earlier in the notebook from `df`, since both use the same file name.
df2.to_csv('WB_statistics.csv', index=False)