# Chapter 4 - World Development Indicator
World Development Indicators (WDI) is the World Bank’s premier compilation of cross-country comparable data on development.

**References**
- http://wdi.worldbank.org/
- https://wbdata.readthedocs.io/en/stable/
- https://datahelpdesk.worldbank.org/knowledgebase/topics/125589-developer-information
- https://databank.worldbank.org/source/world-development-indicators

**Data Source**
- https://datacatalog.worldbank.org/dataset/world-development-indicators
- https://unstats.un.org/unsd/methodology/m49/overview

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio

## Step 1 - Read Data

In [2]:
df = pd.read_csv("../data/WDIData.csv")
df.shape

(380160, 66)

In [139]:
df.sample(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
268176,Netherlands,NLD,"Educational attainment, at least completed sho...",SE.TER.CUAT.ST.MA.ZS,,,,,,,...,32.047642,32.582878,32.589989,33.105862,33.28109,,34.46801,,,
128423,Colombia,COL,CPIA business regulatory environment rating (1...,IQ.CPA.BREG.XQ,,,,,,,...,,,,,,,,,,


**Drop the last unuseful column**

In [3]:
df.drop(columns=df.columns[-1], inplace=True)
df.sample(1)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
69368,Albania,ALB,Coverage of social insurance programs in riche...,per_si_allsi.cov_q5_tot,,,,,,,...,,34.619776,,,,,,,,


## Step 2 - Add additional variables for regions and sub-regions

In [4]:
df_country = pd.read_csv("../data/country_regions.csv")
df_country.head(2)

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed / Developing Countries
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,,Developing
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,,Developing


In [43]:
df_country = df_country[["ISO-alpha3 Code", "Region Name", "Sub-region Name"]]
df_country.sample(2)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name
236,MNP,Oceania,Micronesia
99,BRA,Americas,Latin America and the Caribbean


In [48]:
df_merged= pd.merge(df_country, 
                    df,
                    right_on=["Country Code"],
                    left_on=["ISO-alpha3 Code"],
                    how="right"
)

df_merged.sample(5)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
122841,TCD,Africa,Sub-Saharan Africa,Chad,TCD,Firms visited or required meetings with tax of...,IC.FRM.METG.ZS,,,,...,,,,,,,,82.0,,
181587,GUM,Oceania,Micronesia,Guam,GUM,Chemicals (% of value added in manufacturing),NV.MNF.CHEM.ZS.UN,,,,...,,,,,,,,,,
36684,,,,Low & middle income,LMY,"Labor force with basic education, male (% of m...",SL.TLF.BASC.MA.ZS,,,,...,,,,,,,,,,
141394,CYP,Asia,Western Asia,Cyprus,CYP,CPIA property rights and rule-based governance...,IQ.CPA.PROP.XQ,,,,...,,,,,,,,,,
204704,ISR,Asia,Western Asia,Israel,ISR,"Computer, communications and other services (%...",TX.VAL.OTHR.ZS.WT,16.853933,15.929204,,...,67.151458,68.59742,71.239965,72.246718,73.527325,76.175067,75.213856,77.032892,78.429023,


In [51]:
df_merged.drop(columns=["ISO-alpha3 Code"], inplace=True)
df_merged.sample(2)

Unnamed: 0,Region Name,Sub-region Name,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
272305,Americas,Latin America and the Caribbean,Nicaragua,NIC,"Charges for the use of intellectual property, ...",BM.GSR.ROYL.CD,,,,,...,800000.0,800000.0,800000.0,1200000.0,1200000.0,1900000.0,2000000.0,1900000.0,1100000.0,
288085,Americas,Latin America and the Caribbean,Paraguay,PRY,Annualized average growth rate in per capita r...,SI.SPR.PCAP.ZG,,,,,...,,,,,,,,0.34,,


In [57]:
df_merged.drop(columns=[str(i) for i in range(1960, 1991)], inplace=True)
df_merged.sample(2)

KeyError: "['1960' '1961' '1962' '1963' '1964' '1965' '1966' '1967' '1968' '1969'\n '1970' '1971' '1972' '1973' '1974' '1975' '1976' '1977' '1978' '1979'\n '1980' '1981' '1982' '1983' '1984' '1985' '1986' '1987' '1988' '1989'\n '1990'] not found in axis"

## Step 3 - Melt the dataframe for easy analysis

In [55]:
df_melted = df_merged.melt(id_vars=df_merged.columns[:6], 
                          value_vars=df_merged.columns[6:], 
                          var_name="Year", 
                          value_name='Indicator Value')

df_melted.sample(5)

Unnamed: 0,Region Name,Sub-region Name,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
3326121,Oceania,Micronesia,Palau,PLW,"Pupil-teacher ratio, preprimary",SE.PRE.ENRL.TC.ZS,1999,
7609456,,,East Asia & Pacific,EAS,GNI growth (annual %),NY.GNP.MKTP.KD.ZG,2011,4.433255
7685899,Oceania,Australia and New Zealand,Australia,AUS,"Industrial design applications, resident, by c...",IP.IDS.RSCT,2011,2664.0
9187349,,,Sub-Saharan Africa (IDA & IBRD countries),TSS,"Child employment in agriculture, female (% of ...",SL.AGR.0714.FE.ZS,2015,
1610919,Asia,Southern Asia,Bangladesh,BGD,"Population ages 00-04, female (% of female pop...",SP.POP.0004.FE.5Y,1995,14.10921


In [56]:
df_melted["Year"].unique()

array(['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020'], dtype=object)

In [None]:
df_melted.query("Year>'1989'").query("Year<'2019'").to_csv("world_dev_indicators.csv")

**Find out how many unique indicators**

In [None]:
df["Indicator Code"].nunique()

**1,440 indicators are a lot. Save them to a file for reference**

In [42]:
df_indicator = df[["Indicator Name", "Indicator Code"]].drop_duplicates()
df_indicator.to_csv("indicator_code.csv", index=False)

**Eye ball the file and pick the three indicators of interest:**
- SP.DYN.LE00.IN
    - Life expectancy at birth, total (years)
- NY.GDP.PCAP.PP.CD
    - GDP per capita, PPP (current international $)
- SP.POP.TOTL
    - Population, total 

In [33]:
label_dict = pd.Series(df_indicator["Indicator Name"].values,index=df_indicator["Indicator Code"]).to_dict()


TypeError: unhashable type: 'slice'

In [8]:
indicator_list = ["SP.DYN.LE00.IN","NY.GDP.PCAP.PP.CD","SP.POP.TOTL"]
df_plot = df_melted[df_melted["Indicator Code"].isin(indicator_list)]
df_plot.shape

(48312, 10)

In [201]:
df_plot.sample(5)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
2873277,KAZ,Asia,Central Asia,Kazakhstan,Kazakhstan,KAZ,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,1967,
17585757,BMU,Americas,Northern America,Bermuda,Bermuda,BMU,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,2006,71580.622072
10135420,MEX,Americas,Latin America and the Caribbean,Mexico,Mexico,MEX,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1986,69.232
2140,,,,,Caribbean small states,CSS,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,62.746296
4489180,SAU,Asia,Western Asia,Saudi Arabia,Saudi Arabia,SAU,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1971,53.771


In [9]:
indicator_columns = ("Indicator Name", "Indicator Code", "Indicator Value")
index_columns = [column for column in df_plot.columns if column not in indicator_columns]
index_columns

['ISO-alpha3 Code',
 'Region Name',
 'Sub-region Name',
 'Country or Area',
 'Country Name',
 'Country Code',
 'Year']

In [10]:
df_plot = df_plot.pivot(index=index_columns, columns=["Indicator Code"], values="Indicator Value")
df_plot.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Indicator Code,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,,,,European Union,EUU,1974,,71.726335,395949400.0
,,,,OECD members,OED,1995,19923.540035,75.524422,1153301000.0
SOM,Africa,Sub-Saharan Africa,Somalia,Somalia,SOM,2018,,57.068,15008150.0
TZA,Africa,Sub-Saharan Africa,United Republic of Tanzania,Tanzania,TZA,1980,,50.288,18538260.0
GHA,Africa,Sub-Saharan Africa,Ghana,Ghana,GHA,1966,,48.155,7941412.0


In [11]:
df_plot.reset_index(inplace=True)
df_plot.sample(5)

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
14422,TCD,Africa,Sub-Saharan Africa,Chad,Chad,TCD,1986,,46.397,5247283.0
570,,,,,Europe & Central Asia,ECS,1981,,70.432933,798559799.0
9510,KWT,Asia,Western Asia,Kuwait,Kuwait,KWT,2015,47230.623192,75.13,3835591.0
11919,NRU,Oceania,Micronesia,Nauru,Nauru,NRU,1984,,,8328.0
1474,,,,,Latin America & Caribbean,LCN,1970,,60.435448,285860560.0


In [12]:
df_plot = df_plot[df_plot["Sub-region Name"].notnull()]
df_plot = df_plot[df_plot["Region Name"].notnull()]
df_plot = df_plot[df_plot["SP.POP.TOTL"].notnull()]

In [230]:
df_plot.sample(5)

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
15675,VIR,Americas,Latin America and the Caribbean,United States Virgin Islands,Virgin Islands (U.S.),VIR,2019,,,106631.0
14816,TON,Oceania,Polynesia,Tonga,Tonga,TON,2014,5639.558353,70.427,101028.0
8043,GUY,Americas,Latin America and the Caribbean,Guyana,Guyana,GUY,2012,10479.627598,68.575,755399.0
12149,PAN,Americas,Latin America and the Caribbean,Panama,Panama,PAN,1970,,65.532,1519285.0
3432,ARE,Asia,Western Asia,United Arab Emirates,United Arab Emirates,ARE,1976,,65.916,637922.0


In [13]:
df_scatter = df_plot.query("Year>'1989'").query("Year<'2019'")
df_scatter["Year"].unique()

array(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018'], dtype=object)

In [242]:
df_scatter["NY.GDP.PCAP.PP.CD"].min()

285.586899959856

In [243]:
df_scatter["NY.GDP.PCAP.PP.CD"].max()

141634.96180221302

In [246]:
df_scatter[df_scatter["NY.GDP.PCAP.PP.CD"] > 140000]

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
12923,QAT,Asia,Western Asia,Qatar,Qatar,QAT,2012,141634.961802,79.399,2196074.0


In [244]:
df_scatter["SP.DYN.LE00.IN"].min()

26.171999999999997

In [37]:
df_indicator2 = df_indicator[df_indicator["Indicator Code"].isin(indicator_list)]

In [39]:
label_dict = pd.Series(df_indicator2["Indicator Name"].values,
                       index=df_indicator2["Indicator Code"]).to_dict()
label_dict

{'NY.GDP.PCAP.PP.CD': 'GDP per capita, PPP (current international $)',
 'SP.DYN.LE00.IN': 'Life expectancy at birth, total (years)',
 'SP.POP.TOTL': 'Population, total'}

In [40]:
fig = px.scatter(df_scatter, 
                 x="NY.GDP.PCAP.PP.CD", 
                 y="SP.DYN.LE00.IN",
                 labels=label_dict,
                 width=1000, 
                 height=600,
                 color="Sub-region Name",
         #        color="Region Name",
                 size="SP.POP.TOTL",
          #       log_x=True, 
                 size_max=100,
                 hover_name="Country Name",
                 animation_group="Country Code",
                 animation_frame="Year",
             #    range_x=[250,150000], 
             #    range_y=[25,100]
              #  hover_data=["Country Code", "Country Name", "Sub-region Name"]
)

fig.write_html("scatter.html")
fig.show()

In [226]:
df_gap = px.data.gapminder()
fig = px.scatter(df_gap, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country",
           size="pop", color="continent", hover_name="country",
           log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])

#fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
fig.write_html("gapminder.html")

In [222]:
fig = px.choropleth(df_plot.query("Year=='2018'"),  
                    locations='Country Code', 
                    color='SP.POP.TOTL',
                    color_continuous_scale="Viridis",
                    scope="world",
                    hover_name="Country Name"
#                    locationmode = 'USA-states',
#                    labels={'ST':'State'}
)

#fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.write_html("choropleth.html")
fig.show()

In [228]:
df_gap.sample(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
509,Ethiopia,Africa,1977,44.51,34617799,556.808383,ETH,231
1124,Niger,Africa,1992,47.391,8392818,581.182725,NER,562
462,Egypt,Africa,1982,56.006,45681811,3503.729636,EGY,818
435,Dominican Republic,Americas,1967,56.751,4049146,1653.723003,DOM,214
642,Haiti,Americas,1982,51.461,5198399,2011.159549,HTI,332
